In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from  matplotlib.ticker import PercentFormatter

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

from itertools import cycle
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
#Import AirBnB listings data from Austin
df = pd.read_csv('/content/drive/MyDrive/airbnb_listings_austin.csv')
In [ ]:
#See data types in data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5835 entries, 0 to 5834
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5835 non-null   int64  
 1   listing_url                  5835 non-null   object 
 2   name                         5835 non-null   object 
 3   summary                      5373 non-null   object 
 4   space                        4475 non-null   object 
 5   description                  5832 non-null   object 
 6   experiences_offered          5835 non-null   object 
 7   neighborhood_overview        3572 non-null   object 
 8   notes                        2412 non-null   object 
 9   transit                      3492 non-null   object 
 10  host_id                      5835 non-null   int64  
 11  host_name                    5820 non-null   object 
 12  host_since                   5820 non-null   object 
 13  host_location                5810 non-null   object 
 14  host_about                   3974 non-null   object 
 15  host_response_time           4177 non-null   object 
 16  host_response_rate           4177 non-null   object 
 17  host_is_superhost            5820 non-null   object 
 18  host_listings_count          5820 non-null   float64
 19  host_has_profile_pic         5820 non-null   object 
 20  host_identity_verified       5820 non-null   object 
 21  neighbourhood                4800 non-null   object 
 22  city                         5835 non-null   object 
 23  property_type                5835 non-null   object 
 24  room_type                    5835 non-null   object 
 25  accommodates                 5835 non-null   int64  
 26  bathrooms                    5789 non-null   float64
 27  bedrooms                     5829 non-null   float64
 28  beds                         5812 non-null   float64
 29  bed_type                     5835 non-null   object 
 30  amenities                    5835 non-null   object 
 31  square_feet                  302 non-null    float64
 32  price                        5835 non-null   object 
 33  weekly_price                 2227 non-null   object 
 34  security_deposit             2770 non-null   object 
 35  cleaning_fee                 3587 non-null   object 
 36  guests_included              5835 non-null   int64  
 37  extra_people                 5835 non-null   object 
 38  minimum_nights               5835 non-null   int64  
 39  has_availability             5835 non-null   object 
 40  availability_30              5835 non-null   int64  
 41  availability_60              5835 non-null   int64  
 42  availability_90              5835 non-null   int64  
 43  availability_365             5835 non-null   int64  
 44  number_of_reviews            5835 non-null   int64  
 45  review_scores_rating         3789 non-null   float64
 46  review_scores_accuracy       3776 non-null   float64
 47  review_scores_cleanliness    3778 non-null   float64
 48  review_scores_checkin        3778 non-null   float64
 49  review_scores_communication  3778 non-null   float64
 50  review_scores_location       3779 non-null   float64
 51  review_scores_value          3778 non-null   float64
 52  instant_bookable             5835 non-null   object 
 53  cancellation_policy          5835 non-null   object 
dtypes: float64(12), int64(10), object(32)
memory usage: 2.4+ MB
In [ ]:
#Get number of nulls in each column
df.isnull().sum()
Out[ ]:
id                                0
listing_url                       0
name                              0
summary                         462
space                          1360
description                       3
experiences_offered               0
neighborhood_overview          2263
notes                          3423
transit                        2343
host_id                           0
host_name                        15
host_since                       15
host_location                    25
host_about                     1861
host_response_time             1658
host_response_rate             1658
host_is_superhost                15
host_listings_count              15
host_has_profile_pic             15
host_identity_verified           15
neighbourhood                  1035
city                              0
property_type                     0
room_type                         0
accommodates                      0
bathrooms                        46
bedrooms                          6
beds                             23
bed_type                          0
amenities                         0
square_feet                    5533
price                             0
weekly_price                   3608
security_deposit               3065
cleaning_fee                   2248
guests_included                   0
extra_people                      0
minimum_nights                    0
has_availability                  0
availability_30                   0
availability_60                   0
availability_90                   0
availability_365                  0
number_of_reviews                 0
review_scores_rating           2046
review_scores_accuracy         2059
review_scores_cleanliness      2057
review_scores_checkin          2057
review_scores_communication    2057
review_scores_location         2056
review_scores_value            2057
instant_bookable                  0
cancellation_policy               0
dtype: int64
In [ ]:
#Get number of unqiue values or types in each column
#This helps with deciding what to make dummies of for non-numerical data
df.nunique()
Out[ ]:
id                             5835
listing_url                    5835
name                           5784
summary                        5261
space                          4421
description                    5791
experiences_offered               1
neighborhood_overview          3379
notes                          2155
transit                        3306
host_id                        4633
host_name                      1888
host_since                     1578
host_location                   171
host_about                     2946
host_response_time                4
host_response_rate               50
host_is_superhost                 2
host_listings_count              24
host_has_profile_pic              2
host_identity_verified            2
neighbourhood                    79
city                             12
property_type                    18
room_type                         3
accommodates                     16
bathrooms                        16
bedrooms                         10
beds                             16
bed_type                          5
amenities                      4474
square_feet                     108
price                           468
weekly_price                    475
security_deposit                 76
cleaning_fee                    100
guests_included                  16
extra_people                     59
minimum_nights                   26
has_availability                  1
availability_30                  31
availability_60                  61
availability_90                  91
availability_365                362
number_of_reviews               163
review_scores_rating             41
review_scores_accuracy            8
review_scores_cleanliness         9
review_scores_checkin             7
review_scores_communication       7
review_scores_location            7
review_scores_value               8
instant_bookable                  2
cancellation_policy               5
dtype: int64
In [ ]:
#Brief view of data
df
Out[ ]:
id listing_url name summary space description experiences_offered neighborhood_overview notes transit ... number_of_reviews review_scores_rating review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable cancellation_policy
0 72635 https://www.airbnb.com/rooms/72635 3 Private Bedrooms, SW Austin Conveniently located 10-15 from downtown in SW... We have three spare bedrooms, each with a quee... Conveniently located 10-15 from downtown in SW... none Location and convenience are key. Easy access... NaN Unfortunately there is no convenient public tr... ... 1 100.0 10.0 10.0 10.0 10.0 10.0 10.0 f moderate
1 5386323 https://www.airbnb.com/rooms/5386323 Cricket Trailer Rent this cool concept trailer that has everyt... Rental arrangements for this trailer allows yo... Rent this cool concept trailer that has everyt... none We're talking about wherever you'd like in the... NaN Bike, Bus, Metrorail, etc. you name it we've g... ... 0 NaN NaN NaN NaN NaN NaN NaN f moderate
2 8826517 https://www.airbnb.com/rooms/8826517 Private room 1 in South Austin Upstairs, private, 12ft x 13 1/2ft room. Priv... NaN Upstairs, private, 12ft x 13 1/2ft room. Priv... none NaN NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN f flexible
3 8828616 https://www.airbnb.com/rooms/8828616 Private room 2 in South Austin Upstairs, private, 11ft x 13 1/2ft room. Priv... NaN Upstairs, private, 11ft x 13 1/2ft room. Priv... none NaN NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN f flexible
4 8536913 https://www.airbnb.com/rooms/8536913 Brand-New 3BR Austin Home Brand-new 3BR/2BA Austin home with landscaped ... Feel instantly at home at our brand new 3BR/2B... Brand-new 3BR/2BA Austin home with landscaped ... none Entertainment and activities are plentiful her... NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN f strict
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5830 6063670 https://www.airbnb.com/rooms/6063670 Austin's Downtown Garden Suite Enjoy being literally steps from everything th... If you are looking for the perfect suite in th... Enjoy being literally steps from everything th... none I love that the downtown neighborhood is so vi... If you are interested in hosting an even large... In addition to the Airport Flyer that I alread... ... 9 100.0 10.0 10.0 10.0 10.0 10.0 9.0 f strict
5831 8422925 https://www.airbnb.com/rooms/8422925 Two beds in Downtown Austin! Prime location for the Austin Convention Cente... Located in the heart of downtown, this room co... Prime location for the Austin Convention Cente... none This truly is in the middle of everything goin... NaN Buses leave from across the street (including ... ... 0 NaN NaN NaN NaN NaN NaN NaN f moderate
5832 3345881 https://www.airbnb.com/rooms/3345881 Casa Romántica en Picos de Europa Axtur: Picos de Europa. Desfiladero del Sella ... Una casa excepcional en un paisaje excepcional... Una casa excepcional en un paisaje excepcional... none Pueblecito asturiano, con muy pocos vecinos, d... Paisaje y tranquilidad. En Coche ... 1 100.0 8.0 10.0 10.0 10.0 10.0 8.0 t strict
5833 8954997 https://www.airbnb.com/rooms/8954997 Living room with bed Living room with bed have bathroom. NaN Living room with bed have bathroom. none NaN NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN f flexible
5834 7618185 https://www.airbnb.com/rooms/7618185 Comfy 1 bedroom in North Austin NaN Cozy one bedroom/one bath 1st floor apartment ... Cozy one bedroom/one bath 1st floor apartment ... none NaN The security deposit may be forfeited in the e... Close to grocery stores, restaurants and a mov... ... 0 NaN NaN NaN NaN NaN NaN NaN f strict

5835 rows × 54 columns

In [ ]:
#Impute columns (vars) with a value like f for false
def impute(value, vars):
  for var in vars:
    df[var][df[var].isnull() == True] = value
    print(f'{var} nulls:', df[var].isnull().sum())
In [ ]:
#Make dummies when a column consists of t for true and f for false
def make_binary_dummies(vars):
  for var in vars:
    df[var][df[var] == 'f'] = 0
    df[var][df[var] == 't'] = 1
    print(df[var].head(20))
In [ ]:
#Make categorical columns into numerical dummies where each category is replaced
#by a number starting from 0
#Used in Pandas apply function
def dummyize(x, columns):
    for column in range(len(columns)):
        if x == columns[column]:
            return column
In [ ]:
#Impute all nulls with false
impute('f', ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable'])
host_is_superhost nulls: 0
host_has_profile_pic nulls: 0
host_identity_verified nulls: 0
instant_bookable nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var].isnull() == True] = value
In [ ]:
#Impute cleaning_fee and security_deposit with $0
impute('$0', ['cleaning_fee', 'security_deposit'])
cleaning_fee nulls: 0
security_deposit nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var].isnull() == True] = value
In [ ]:
#Impute bathrooms, bedrooms, and beds with 1 of each
impute(1, ['bathrooms', 'bedrooms', 'beds'])
bathrooms nulls: 0
bedrooms nulls: 0
beds nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var].isnull() == True] = value
In [ ]:
#Impute missing host since with  12/6/2023
impute('12/6/2023', ['host_since'])
host_since nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var].isnull() == True] = value
In [ ]:
#Convert price columns into floats
for var in ['price', 'weekly_price', 'security_deposit', 'extra_people', 'cleaning_fee']:
  df[var] = df[var].replace('\$', '', regex=True).replace(',', '', regex=True).astype(float)
  print(df[var].head())
0    300.0
1     99.0
2    100.0
3    100.0
4    599.0
Name: price, dtype: float64
0      NaN
1    600.0
2      NaN
3      NaN
4      NaN
Name: weekly_price, dtype: float64
0       0.0
1    1000.0
2       0.0
3       0.0
4       0.0
Name: security_deposit, dtype: float64
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: extra_people, dtype: float64
0      0.0
1     75.0
2      0.0
3      0.0
4    125.0
Name: cleaning_fee, dtype: float64
In [ ]:
#Store current weekly price values for checking if imputing was done correctly
#where the null were
df['old_weekly_price'] = df['weekly_price']
In [ ]:
#Impute weekly_price with the price times 7
df['weekly_price'][df['weekly_price'].isnull() == True] = df['price'] * 7
print('weekly_price nulls:', df[var].isnull().sum())
weekly_price nulls: 0
<ipython-input-894-41f3e5500f06>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weekly_price'][df['weekly_price'].isnull() == True] = df['price'] * 7
In [ ]:
#Check that the imputation was done correctly
df[['price', 'weekly_price']][df['old_weekly_price'].isnull() == True].head(60)
Out[ ]:
price weekly_price
0 300.0 2100.0
2 100.0 700.0
3 100.0 700.0
4 599.0 4193.0
5 100.0 700.0
6 54.0 378.0
7 40.0 280.0
11 50.0 350.0
14 55.0 385.0
18 59.0 413.0
20 40.0 280.0
21 150.0 1050.0
22 175.0 1225.0
24 120.0 840.0
25 450.0 3150.0
26 49.0 343.0
29 50.0 350.0
33 200.0 1400.0
36 155.0 1085.0
37 700.0 4900.0
38 819.0 5733.0
39 1200.0 8400.0
40 1250.0 8750.0
45 600.0 4200.0
46 200.0 1400.0
49 629.0 4403.0
51 469.0 3283.0
52 1500.0 10500.0
54 250.0 1750.0
55 499.0 3493.0
62 165.0 1155.0
63 115.0 805.0
64 909.0 6363.0
69 350.0 2450.0
70 130.0 910.0
73 600.0 4200.0
75 65.0 455.0
78 450.0 3150.0
79 6500.0 45500.0
80 249.0 1743.0
84 300.0 2100.0
87 88.0 616.0
90 200.0 1400.0
91 150.0 1050.0
93 625.0 4375.0
95 250.0 1750.0
97 119.0 833.0
99 199.0 1393.0
100 97.0 679.0
103 118.0 826.0
104 163.0 1141.0
105 400.0 2800.0
106 125.0 875.0
107 169.0 1183.0
109 300.0 2100.0
110 75.0 525.0
111 60.0 420.0
112 500.0 3500.0
113 157.0 1099.0
116 399.0 2793.0
In [ ]:
df['host_location'].head(60)
Out[ ]:
0          Austin, Texas, United States
1          Austin, Texas, United States
2          Austin, Texas, United States
3          Austin, Texas, United States
4                                    US
5          Austin, Texas, United States
6          Austin, Texas, United States
7          Austin, Texas, United States
8                                    US
9          Austin, Texas, United States
10                                   US
11         Austin, Texas, United States
12         Austin, Texas, United States
13         Austin, Texas, United States
14         Austin, Texas, United States
15         Austin, Texas, United States
16         Austin, Texas, United States
17         Austin, Texas, United States
18         Austin, Texas, United States
19         Austin, Texas, United States
20         Austin, Texas, United States
21         Austin, Texas, United States
22         Austin, Texas, United States
23         Austin, Texas, United States
24                                   US
25                                   US
26         Austin, Texas, United States
27         Austin, Texas, United States
28         Austin, Texas, United States
29         Austin, Texas, United States
30         Austin, Texas, United States
31         Austin, Texas, United States
32                                   US
33         Austin, Texas, United States
34         Austin, Texas, United States
35         Austin, Texas, United States
36         Austin, Texas, United States
37                                   US
38                                   US
39         Austin, Texas, United States
40         Austin, Texas, United States
41                                   US
42                                   US
43                                   US
44         Austin, Texas, United States
45                                   US
46         Austin, Texas, United States
47    New York, New York, United States
48         Austin, Texas, United States
49                                   US
50         Austin, Texas, United States
51                                   US
52                                   US
53                                   US
54                                   US
55                                   US
56         Austin, Texas, United States
57         Austin, Texas, United States
58         Austin, Texas, United States
59         Austin, Texas, United States
Name: host_location, dtype: object
In [ ]:
#Make host_location 1 for if a host is from Austin and 0 if the host is not
#List of cities that are or the suburbs of Austin in the data
Austin_city_list = ['Austin, Texas, United States', 'Austin, Texas', 'Austin',\
                    'Dripping Springs, Texas, United States', 'Sunset Valley, Texas, United States', \
                    'West Lake Hills, Texas, United States', 'Round Rock, Texas, United States', \
                    'Pflugerville, Texas, United States']

df['host_location'][df['host_location'].isin(Austin_city_list)] = 1
df['host_location'][df['host_location'] != 1] = 0
df['host_location'].head(60)
<ipython-input-897-77b4485cd734>:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['host_location'][df['host_location'].isin(Austin_city_list)] = 1
<ipython-input-897-77b4485cd734>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['host_location'][df['host_location'] != 1] = 0
Out[ ]:
0     1
1     1
2     1
3     1
4     0
5     1
6     1
7     1
8     0
9     1
10    0
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    0
25    0
26    1
27    1
28    1
29    1
30    1
31    1
32    0
33    1
34    1
35    1
36    1
37    0
38    0
39    1
40    1
41    0
42    0
43    0
44    1
45    0
46    1
47    0
48    1
49    0
50    1
51    0
52    0
53    0
54    0
55    0
56    1
57    1
58    1
59    1
Name: host_location, dtype: object
In [ ]:
#Drop listing from Spain
df['city'].drop(df['city'].str.contains('Cangas de Onís').index, inplace=True)
In [ ]:
#Drop columns that will not be used in making models
#Drop temporary old_weekly_price column
df.drop(['old_weekly_price', 'experiences_offered', 'has_availability', 'id', \
          'listing_url', 'name', 'summary', 'city', 'description', 'notes', \
          'neighborhood_overview', 'host_about', 'amenities', 'transit', 'host_id', \
          'host_name', 'host_about', 'neighbourhood', 'square_feet', 'space'], axis = 1, inplace=True)
In [ ]:
df.dropna(subset=['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', \
          'review_scores_checkin', 'review_scores_communication', 'review_scores_location', \
          'review_scores_value', 'host_response_time', 'host_response_rate'], inplace=True)
In [ ]:
make_binary_dummies(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable'])
7     0
9     1
11    0
12    1
13    1
14    1
16    0
19    1
23    0
26    1
27    0
28    0
29    0
34    1
35    0
37    0
39    0
40    0
44    1
46    0
Name: host_is_superhost, dtype: object
7     1
9     1
11    1
12    1
13    1
14    1
16    1
19    1
23    1
26    1
27    1
28    1
29    1
34    1
35    1
37    1
39    1
40    1
44    1
46    1
Name: host_has_profile_pic, dtype: object
7     1
9     1
11    1
12    1
13    1
14    0
16    1
19    1
23    1
26    1
27    1
28    1
29    1
34    1
35    1
37    1
39    1
40    1
44    1
46    1
Name: host_identity_verified, dtype: object
7     1
9     0
11    0
12    0
13    0
14    0
16    0
19    0
23    0
26    0
27    0
28    0
29    0
34    0
35    0
37    0
39    0
40    0
44    1
46    0
Name: instant_bookable, dtype: object
<ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 'f'] = 0
<ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 't'] = 1
<ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 'f'] = 0
<ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 't'] = 1
<ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 'f'] = 0
<ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 't'] = 1
<ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 'f'] = 0
<ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var][df[var] == 't'] = 1
In [ ]:
#Get unique room types
df["room_type"].unique()
Out[ ]:
array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)
In [ ]:
#Get unique room types
df["cancellation_policy"].unique()
Out[ ]:
array(['moderate', 'strict', 'flexible', 'super_strict_30', 'no_refunds'],
      dtype=object)
In [ ]:
#Convert categorical columns into dummies
df["property_type"] = df["property_type"].apply(lambda x: dummyize(x, pd.get_dummies(df['property_type']).columns))
df["room_type"] = df["room_type"].apply(lambda x: dummyize(x, pd.get_dummies(df['room_type']).columns))
df["bed_type"] = df["bed_type"].apply(lambda x: dummyize(x, pd.get_dummies(df['bed_type']).columns))
df["cancellation_policy"] = df["cancellation_policy"].apply(lambda x: dummyize(x, pd.get_dummies(df['cancellation_policy']).columns))
df["host_response_time"] = df["host_response_time"].apply(lambda x: dummyize(x, pd.get_dummies(df['host_response_time']).columns))
In [ ]:
#See dummies of those room types
df["room_type"].unique()
Out[ ]:
array([1, 0, 2])
In [ ]:
#Get unique room types
df["cancellation_policy"].unique()
Out[ ]:
array([1, 3, 0, 4, 2])
In [ ]:
#Get rid of the percentage sign and covert the response rate into floats
df['host_response_rate'] = df['host_response_rate'].str.replace('%', '').astype('float')
In [ ]:
#Convert the categorical variables to type category so they can be used in
#logistic regression models without issue
for column in ['host_location', 'host_is_superhost', 'host_has_profile_pic', \
               'host_identity_verified', 'instant_bookable']:
  df[column] = df[column].astype('category')
In [ ]:
#Convert host_since to datetime type
df['host_since'] = pd.to_datetime(df['host_since'])

#Get the year from host_since and convert it to a int since datetime
#can't be used in sci-kit learn models.

#Only have a column for the year as a float since the day or month probably
#won't be useful as their own columns

#Convert to float first because converting directly to int causes all the years
#to become 1970
df['host_since'] = df['host_since'].dt.year.astype('float').astype('int')
In [ ]:
#Check resulting dataframe
df
Out[ ]:
host_since host_location host_response_time host_response_rate host_is_superhost host_listings_count host_has_profile_pic host_identity_verified property_type room_type ... number_of_reviews review_scores_rating review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable cancellation_policy
7 2014 1 3 100.0 0 1.0 1 1 9 1 ... 2 100.0 10.0 10.0 10.0 10.0 10.0 10.0 1 1
9 2012 1 3 100.0 1 1.0 1 1 9 1 ... 20 99.0 10.0 10.0 10.0 10.0 10.0 10.0 0 1
11 2011 1 3 100.0 0 1.0 1 1 9 1 ... 9 93.0 10.0 10.0 10.0 10.0 10.0 10.0 0 3
12 2013 1 3 97.0 1 9.0 1 1 9 1 ... 4 100.0 9.0 10.0 10.0 10.0 10.0 10.0 0 3
13 2013 1 3 97.0 1 9.0 1 1 9 1 ... 7 100.0 10.0 10.0 10.0 10.0 10.0 10.0 0 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5826 2013 1 2 100.0 0 339.0 1 1 0 0 ... 12 85.0 9.0 9.0 9.0 9.0 10.0 8.0 0 3
5827 2011 1 2 87.0 0 19.0 1 1 0 0 ... 13 94.0 9.0 9.0 10.0 9.0 10.0 9.0 0 3
5829 2012 1 2 95.0 0 11.0 1 0 0 0 ... 13 88.0 10.0 10.0 10.0 10.0 10.0 10.0 0 1
5830 2013 1 2 100.0 1 21.0 1 1 0 0 ... 9 100.0 10.0 10.0 10.0 10.0 10.0 9.0 0 3
5832 2014 0 3 100.0 0 5.0 1 0 9 0 ... 1 100.0 8.0 10.0 10.0 10.0 10.0 8.0 1 3

3372 rows × 36 columns

In [ ]:
#Get number of nulls in each column
df.isnull().sum()
Out[ ]:
host_since                     0
host_location                  0
host_response_time             0
host_response_rate             0
host_is_superhost              0
host_listings_count            0
host_has_profile_pic           0
host_identity_verified         0
property_type                  0
room_type                      0
accommodates                   0
bathrooms                      0
bedrooms                       0
beds                           0
bed_type                       0
price                          0
weekly_price                   0
security_deposit               0
cleaning_fee                   0
guests_included                0
extra_people                   0
minimum_nights                 0
availability_30                0
availability_60                0
availability_90                0
availability_365               0
number_of_reviews              0
review_scores_rating           0
review_scores_accuracy         0
review_scores_cleanliness      0
review_scores_checkin          0
review_scores_communication    0
review_scores_location         0
review_scores_value            0
instant_bookable               0
cancellation_policy            0
dtype: int64
In [ ]:
#See data types in data
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3372 entries, 7 to 5832
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   host_since                   3372 non-null   int64   
 1   host_location                3372 non-null   category
 2   host_response_time           3372 non-null   int64   
 3   host_response_rate           3372 non-null   float64 
 4   host_is_superhost            3372 non-null   category
 5   host_listings_count          3372 non-null   float64 
 6   host_has_profile_pic         3372 non-null   category
 7   host_identity_verified       3372 non-null   category
 8   property_type                3372 non-null   int64   
 9   room_type                    3372 non-null   int64   
 10  accommodates                 3372 non-null   int64   
 11  bathrooms                    3372 non-null   float64 
 12  bedrooms                     3372 non-null   float64 
 13  beds                         3372 non-null   float64 
 14  bed_type                     3372 non-null   int64   
 15  price                        3372 non-null   float64 
 16  weekly_price                 3372 non-null   float64 
 17  security_deposit             3372 non-null   float64 
 18  cleaning_fee                 3372 non-null   float64 
 19  guests_included              3372 non-null   int64   
 20  extra_people                 3372 non-null   float64 
 21  minimum_nights               3372 non-null   int64   
 22  availability_30              3372 non-null   int64   
 23  availability_60              3372 non-null   int64   
 24  availability_90              3372 non-null   int64   
 25  availability_365             3372 non-null   int64   
 26  number_of_reviews            3372 non-null   int64   
 27  review_scores_rating         3372 non-null   float64 
 28  review_scores_accuracy       3372 non-null   float64 
 29  review_scores_cleanliness    3372 non-null   float64 
 30  review_scores_checkin        3372 non-null   float64 
 31  review_scores_communication  3372 non-null   float64 
 32  review_scores_location       3372 non-null   float64 
 33  review_scores_value          3372 non-null   float64 
 34  instant_bookable             3372 non-null   category
 35  cancellation_policy          3372 non-null   int64   
dtypes: category(5), float64(17), int64(14)
memory usage: 860.1 KB

Problem 1¶

In [ ]:
 

Problem 2¶

Preprocessing for problem¶

In [ ]:
#Drop availability variables except availability_90
df_prob_2 = df.drop(['availability_30', 'availability_60', 'availability_365'], axis = 1)
In [ ]:
#Make availability_90 into a percentage
print(df_prob_2['availability_90'])
df_prob_2['availability_90'] = df_prob_2['availability_90'] / 90
print(df_prob_2['availability_90'])
7       37
9       89
11      84
12      85
13      89
        ..
5826    80
5827    76
5829    73
5830    65
5832    89
Name: availability_90, Length: 3372, dtype: int64
7       0.411111
9       0.988889
11      0.933333
12      0.944444
13      0.988889
          ...   
5826    0.888889
5827    0.844444
5829    0.811111
5830    0.722222
5832    0.988889
Name: availability_90, Length: 3372, dtype: float64
In [ ]:
def is_booked(percentage):
  if percentage < 0.4:
    return 1
  else:
    return 0

df_prob_2["availability_90"] = df_prob_2["availability_90"].apply(lambda x: is_booked(x))

df_prob_2["availability_90"].head(60)
Out[ ]:
7      0
9      0
11     0
12     0
13     0
14     0
16     0
19     1
23     0
26     0
27     0
28     0
29     0
34     0
35     0
37     0
39     1
40     0
44     0
46     0
50     0
51     0
53     0
56     0
57     0
58     0
60     0
62     0
63     0
65     0
66     0
67     0
68     0
71     0
72     0
75     0
76     0
77     0
83     0
85     0
86     1
87     0
88     0
89     0
93     0
94     1
96     1
100    0
101    1
102    0
103    0
104    0
107    0
108    0
110    0
111    0
113    1
114    0
115    0
116    0
Name: availability_90, dtype: int64
In [ ]:
#Drop number of reviews since common sense tells us that popularity
#will be closely related to how booked an AirBnB is and the models
#will heavily train on it if it is left in for this problem
df_prob_2.drop('number_of_reviews', axis=1, inplace=True)
In [ ]:
#Make availability_90 of type category so it can be used in training models
df_prob_2['availability_90'] = df_prob_2['availability_90'].astype('category')
In [ ]:
df_prob_2_sampled = df_prob_2.groupby('availability_90').apply(lambda s: s.sample(500))
In [ ]:
#Split the data on what the model is learning to predict, whether an AirBnB will be booked
X = df_prob_2_sampled.drop('availability_90', axis=1)

y = df_prob_2_sampled['availability_90']

#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 700 entries, (0, 57) to (1, 2090)
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   host_since                   700 non-null    int64   
 1   host_location                700 non-null    category
 2   host_response_time           700 non-null    int64   
 3   host_response_rate           700 non-null    float64 
 4   host_is_superhost            700 non-null    category
 5   host_listings_count          700 non-null    float64 
 6   host_has_profile_pic         700 non-null    category
 7   host_identity_verified       700 non-null    category
 8   property_type                700 non-null    int64   
 9   room_type                    700 non-null    int64   
 10  accommodates                 700 non-null    int64   
 11  bathrooms                    700 non-null    float64 
 12  bedrooms                     700 non-null    float64 
 13  beds                         700 non-null    float64 
 14  bed_type                     700 non-null    int64   
 15  price                        700 non-null    float64 
 16  weekly_price                 700 non-null    float64 
 17  security_deposit             700 non-null    float64 
 18  cleaning_fee                 700 non-null    float64 
 19  guests_included              700 non-null    int64   
 20  extra_people                 700 non-null    float64 
 21  minimum_nights               700 non-null    int64   
 22  review_scores_rating         700 non-null    float64 
 23  review_scores_accuracy       700 non-null    float64 
 24  review_scores_cleanliness    700 non-null    float64 
 25  review_scores_checkin        700 non-null    float64 
 26  review_scores_communication  700 non-null    float64 
 27  review_scores_location       700 non-null    float64 
 28  review_scores_value          700 non-null    float64 
 29  instant_bookable             700 non-null    category
 30  cancellation_policy          700 non-null    int64   
dtypes: category(5), float64(17), int64(9)
memory usage: 188.6 KB
<class 'pandas.core.series.Series'>
MultiIndex: 700 entries, (0, 57) to (1, 2090)
Series name: availability_90
Non-Null Count  Dtype   
--------------  -----   
700 non-null    category
dtypes: category(1)
memory usage: 43.2 KB

Create And Assess Decision Tree Classifiers¶

Default Tree¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_2 = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)

# fit the model to the training data
dt_prob_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_2, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_prob_2")
Out[ ]:
'decision_tree_default_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_2,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c37c3af0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_2.predict(X_train)
y_pred_test = dt_prob_2.predict(X_test)

y_prob_train = dt_prob_2.predict_proba(X_train)
y_prob_test = dt_prob_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.7586
Precision: 0.7514
Recall.  : 0.7749

 -- test set -- 
Accuracy : 0.5433
Precision: 0.5357
Recall.  : 0.6040
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[259  79]
 [ 90 272]]
[[73 59]
 [78 90]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_2.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                      feature
15    0.186640                        price
5     0.120284          host_listings_count
18    0.089013                 cleaning_fee
2     0.078193           host_response_time
20    0.076561                 extra_people
17    0.063001             security_deposit
9     0.049921                    room_type
13    0.043169                         beds
22    0.043145         review_scores_rating
3     0.038342           host_response_rate
0     0.033662                   host_since
27    0.029936       review_scores_location
28    0.021485          review_scores_value
11    0.019586                    bathrooms
19    0.018006              guests_included
7     0.017736       host_identity_verified
4     0.016144            host_is_superhost
29    0.014918             instant_bookable
16    0.011435                 weekly_price
21    0.010926               minimum_nights
8     0.009458                property_type
12    0.008442                     bedrooms
6     0.000000         host_has_profile_pic
23    0.000000       review_scores_accuracy
24    0.000000    review_scores_cleanliness
25    0.000000        review_scores_checkin
26    0.000000  review_scores_communication
1     0.000000                host_location
14    0.000000                     bed_type
10    0.000000                 accommodates
30    0.000000          cancellation_policy
No description has been provided for this image

Tuned Tree¶

In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune_prob_2 = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune_prob_2, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
print(best_estimator)
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 100}
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=100)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values

dt_tuned_prob_2 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=100, ccp_alpha = 0)

# fit the model to the training data
dt_tuned_prob_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=100)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_prob_2, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_prob_2")
Out[ ]:
'decision_tree_tuned_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_prob_2,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c2a185e0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_prob_2.predict(X_train)
y_pred_test = dt_tuned_prob_2.predict(X_test)

y_prob_train = dt_tuned_prob_2.predict_proba(X_train)
y_prob_test = dt_tuned_prob_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.6100
Precision: 0.6154
Recall.  : 0.5926

 -- test set -- 
Accuracy : 0.5767
Precision: 0.5786
Recall.  : 0.5436
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[219 143]
 [130 208]]
[[92 68]
 [59 81]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_prob_2.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                      feature
5     0.457687          host_listings_count
20    0.410974                 extra_people
8     0.112919                property_type
17    0.018419             security_deposit
0     0.000000                   host_since
29    0.000000             instant_bookable
28    0.000000          review_scores_value
27    0.000000       review_scores_location
26    0.000000  review_scores_communication
25    0.000000        review_scores_checkin
24    0.000000    review_scores_cleanliness
23    0.000000       review_scores_accuracy
22    0.000000         review_scores_rating
21    0.000000               minimum_nights
19    0.000000              guests_included
18    0.000000                 cleaning_fee
15    0.000000                        price
16    0.000000                 weekly_price
1     0.000000                host_location
14    0.000000                     bed_type
13    0.000000                         beds
12    0.000000                     bedrooms
11    0.000000                    bathrooms
10    0.000000                 accommodates
9     0.000000                    room_type
7     0.000000       host_identity_verified
6     0.000000         host_has_profile_pic
4     0.000000            host_is_superhost
3     0.000000           host_response_rate
2     0.000000           host_response_time
30    0.000000          cancellation_policy
No description has been provided for this image

Tree With Lower Min¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_2_2 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=50, ccp_alpha = 0)

# fit the model to the training data
dt_prob_2_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=50)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=50)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_2_2, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_lower_min_prob_2")
Out[ ]:
'decision_tree_lower_min_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_2_2,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3852950>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_2_2.predict(X_train)
y_pred_test = dt_prob_2_2.predict(X_test)

y_prob_train = dt_prob_2_2.predict_proba(X_train)
y_prob_test = dt_prob_2_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.6229
Precision: 0.6570
Recall.  : 0.5185

 -- test set -- 
Accuracy : 0.5933
Precision: 0.6063
Recall.  : 0.5168
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[254 169]
 [ 95 182]]
[[101  72]
 [ 50  77]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_2_2.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                      feature
5     0.269645          host_listings_count
20    0.242124                 extra_people
15    0.202653                        price
22    0.131451         review_scores_rating
28    0.078247          review_scores_value
9     0.075881                    room_type
17    0.000000             security_deposit
29    0.000000             instant_bookable
27    0.000000       review_scores_location
26    0.000000  review_scores_communication
25    0.000000        review_scores_checkin
24    0.000000    review_scores_cleanliness
23    0.000000       review_scores_accuracy
21    0.000000               minimum_nights
19    0.000000              guests_included
18    0.000000                 cleaning_fee
0     0.000000                   host_since
16    0.000000                 weekly_price
1     0.000000                host_location
14    0.000000                     bed_type
13    0.000000                         beds
12    0.000000                     bedrooms
11    0.000000                    bathrooms
10    0.000000                 accommodates
8     0.000000                property_type
7     0.000000       host_identity_verified
6     0.000000         host_has_profile_pic
4     0.000000            host_is_superhost
3     0.000000           host_response_rate
2     0.000000           host_response_time
30    0.000000          cancellation_policy
No description has been provided for this image

Tree With Even Lower Min¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_2_3 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=30, ccp_alpha = 0)

# fit the model to the training data
dt_prob_2_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=30)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=30)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_2_3, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_less_complexity_prob_2")
Out[ ]:
'decision_tree_less_complexity_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_2_3,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3d9bee0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_2_3.predict(X_train)
y_pred_test = dt_prob_2_3.predict(X_test)

y_prob_train = dt_prob_2_3.predict_proba(X_train)
y_prob_test = dt_prob_2_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.6586
Precision: 0.6505
Recall.  : 0.6895

 -- test set -- 
Accuracy : 0.5933
Precision: 0.5808
Recall.  : 0.6510
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[219 109]
 [130 242]]
[[81 52]
 [70 97]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_2_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                      feature
5     0.225162          host_listings_count
20    0.162890                 extra_people
15    0.107165                        price
0     0.086502                   host_since
22    0.085147         review_scores_rating
18    0.085131                 cleaning_fee
3     0.084799           host_response_rate
28    0.052641          review_scores_value
9     0.051049                    room_type
13    0.030899                         beds
10    0.028615                 accommodates
4     0.000000            host_is_superhost
2     0.000000           host_response_time
29    0.000000             instant_bookable
27    0.000000       review_scores_location
26    0.000000  review_scores_communication
25    0.000000        review_scores_checkin
24    0.000000    review_scores_cleanliness
23    0.000000       review_scores_accuracy
21    0.000000               minimum_nights
17    0.000000             security_deposit
19    0.000000              guests_included
16    0.000000                 weekly_price
1     0.000000                host_location
14    0.000000                     bed_type
12    0.000000                     bedrooms
11    0.000000                    bathrooms
8     0.000000                property_type
7     0.000000       host_identity_verified
6     0.000000         host_has_profile_pic
30    0.000000          cancellation_policy
No description has been provided for this image

Create And Assess Logistic Regression Models¶

Full Logistic¶

In [ ]:
#Fit full logistic regression model to examine the significance of the terms
X = X_train
X = sm.add_constant(X)
y=y_train

logit_full_prob_2 = sm.Logit(y, X).fit()

#Print summary
print(logit_full_prob_2.summary())
Optimization terminated successfully.
         Current function value: 0.656205
         Iterations 6
                           Logit Regression Results                           
==============================================================================
Dep. Variable:        availability_90   No. Observations:                  700
Model:                          Logit   Df Residuals:                      669
Method:                           MLE   Df Model:                           30
Date:                Wed, 13 Dec 2023   Pseudo R-squ.:                 0.05329
Time:                        22:46:20   Log-Likelihood:                -459.34
converged:                       True   LL-Null:                       -485.20
Covariance Type:            nonrobust   LLR p-value:                  0.008177
===============================================================================================
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                         -14.5786   4.55e+15   -3.2e-15      1.000   -8.92e+15    8.92e+15
host_since                      0.0122      0.063      0.194      0.846      -0.111       0.135
host_location                   0.3165      0.254      1.247      0.212      -0.181       0.814
host_response_time             -0.0875      0.123     -0.713      0.476      -0.328       0.153
host_response_rate              0.0153      0.008      1.814      0.070      -0.001       0.032
host_is_superhost              -0.2970      0.224     -1.327      0.184      -0.736       0.142
host_listings_count            -0.0011      0.002     -0.713      0.476      -0.004       0.002
host_has_profile_pic          -14.5786   4.55e+15   -3.2e-15      1.000   -8.92e+15    8.92e+15
host_identity_verified         -0.3349      0.203     -1.646      0.100      -0.734       0.064
property_type                  -0.0347      0.021     -1.621      0.105      -0.077       0.007
room_type                      -0.5357      0.220     -2.430      0.015      -0.968      -0.104
accommodates                   -0.0652      0.060     -1.091      0.275      -0.182       0.052
bathrooms                      -0.0465      0.192     -0.242      0.809      -0.423       0.330
bedrooms                        0.2715      0.150      1.807      0.071      -0.023       0.566
beds                           -0.1236      0.098     -1.266      0.206      -0.315       0.068
bed_type                       -0.1606      0.172     -0.932      0.351      -0.498       0.177
price                           0.0023      0.002      0.925      0.355      -0.003       0.007
weekly_price                   -0.0004      0.000     -0.993      0.320      -0.001       0.000
security_deposit            -3.572e-06      0.000     -0.014      0.989      -0.001       0.001
cleaning_fee                   -0.0005      0.002     -0.221      0.825      -0.005       0.004
guests_included                -0.0863      0.062     -1.403      0.161      -0.207       0.034
extra_people                   -0.0024      0.003     -0.771      0.441      -0.009       0.004
minimum_nights                  0.0299      0.029      1.038      0.299      -0.026       0.086
review_scores_rating           -0.0162      0.029     -0.565      0.572      -0.073       0.040
review_scores_accuracy          0.1096      0.309      0.355      0.723      -0.496       0.715
review_scores_cleanliness       0.1237      0.127      0.974      0.330      -0.125       0.373
review_scores_checkin          -0.1337        nan        nan        nan         nan         nan
review_scores_communication     0.4864        nan        nan        nan         nan         nan
review_scores_location          0.0066      0.091      0.073      0.942      -0.172       0.185
review_scores_value             0.0583      0.143      0.409      0.683      -0.221       0.338
instant_bookable                0.1178      0.262      0.449      0.653      -0.396       0.632
cancellation_policy            -0.0068      0.069     -0.098      0.922      -0.142       0.128
===============================================================================================
In [ ]:
# Generate predicted values for training set
pprob = logit_full_prob_2.predict(X)

# Create predicted category for success using 50% cutoff
psuccess = (pprob > 0.5).astype(int)

# Add new variables to the training data set
X_train['p_success'] = psuccess
X_train['p_prob'] = pprob
X_train['y'] = y_train

X_train.info()


# Generate predicted values for test set

X_test = sm.add_constant(X_test)

pprob_test = logit_full_prob_2.predict(X_test)

# Create predicted category for success using 50% cutoff
psuccess_test = (pprob_test > 0.5).astype(int)

# Add new variables to the response data set
X_test['p_success'] = psuccess_test
X_test['p_prob'] = pprob_test

X_test.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 700 entries, (0, 57) to (1, 2090)
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   host_since                   700 non-null    int64   
 1   host_location                700 non-null    category
 2   host_response_time           700 non-null    int64   
 3   host_response_rate           700 non-null    float64 
 4   host_is_superhost            700 non-null    category
 5   host_listings_count          700 non-null    float64 
 6   host_has_profile_pic         700 non-null    category
 7   host_identity_verified       700 non-null    category
 8   property_type                700 non-null    int64   
 9   room_type                    700 non-null    int64   
 10  accommodates                 700 non-null    int64   
 11  bathrooms                    700 non-null    float64 
 12  bedrooms                     700 non-null    float64 
 13  beds                         700 non-null    float64 
 14  bed_type                     700 non-null    int64   
 15  price                        700 non-null    float64 
 16  weekly_price                 700 non-null    float64 
 17  security_deposit             700 non-null    float64 
 18  cleaning_fee                 700 non-null    float64 
 19  guests_included              700 non-null    int64   
 20  extra_people                 700 non-null    float64 
 21  minimum_nights               700 non-null    int64   
 22  review_scores_rating         700 non-null    float64 
 23  review_scores_accuracy       700 non-null    float64 
 24  review_scores_cleanliness    700 non-null    float64 
 25  review_scores_checkin        700 non-null    float64 
 26  review_scores_communication  700 non-null    float64 
 27  review_scores_location       700 non-null    float64 
 28  review_scores_value          700 non-null    float64 
 29  instant_bookable             700 non-null    category
 30  cancellation_policy          700 non-null    int64   
 31  p_success                    700 non-null    int64   
 32  p_prob                       700 non-null    float64 
 33  y                            700 non-null    category
dtypes: category(6), float64(18), int64(10)
memory usage: 200.4 KB
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 300 entries, (0, 5415) to (0, 1953)
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   const                        300 non-null    float64 
 1   host_since                   300 non-null    int64   
 2   host_location                300 non-null    category
 3   host_response_time           300 non-null    int64   
 4   host_response_rate           300 non-null    float64 
 5   host_is_superhost            300 non-null    category
 6   host_listings_count          300 non-null    float64 
 7   host_has_profile_pic         300 non-null    category
 8   host_identity_verified       300 non-null    category
 9   property_type                300 non-null    int64   
 10  room_type                    300 non-null    int64   
 11  accommodates                 300 non-null    int64   
 12  bathrooms                    300 non-null    float64 
 13  bedrooms                     300 non-null    float64 
 14  beds                         300 non-null    float64 
 15  bed_type                     300 non-null    int64   
 16  price                        300 non-null    float64 
 17  weekly_price                 300 non-null    float64 
 18  security_deposit             300 non-null    float64 
 19  cleaning_fee                 300 non-null    float64 
 20  guests_included              300 non-null    int64   
 21  extra_people                 300 non-null    float64 
 22  minimum_nights               300 non-null    int64   
 23  review_scores_rating         300 non-null    float64 
 24  review_scores_accuracy       300 non-null    float64 
 25  review_scores_cleanliness    300 non-null    float64 
 26  review_scores_checkin        300 non-null    float64 
 27  review_scores_communication  300 non-null    float64 
 28  review_scores_location       300 non-null    float64 
 29  review_scores_value          300 non-null    float64 
 30  instant_bookable             300 non-null    category
 31  cancellation_policy          300 non-null    int64   
 32  p_success                    300 non-null    int64   
 33  p_prob                       300 non-null    float64 
dtypes: category(5), float64(19), int64(10)
memory usage: 111.3 KB
In [ ]:
# Generate confusion matrix for training set
conf_matrix = confusion_matrix(psuccess, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(psuccess_test, y_test)
print(conf_matrix)
[[209 127]
 [140 224]]
[[93 59]
 [58 90]]
In [ ]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, pprob)
roc_auc = roc_auc_score(y_train.cat.codes, pprob)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, pprob_test)
roc_auc = roc_auc_score(y_test.cat.codes, pprob_test)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, psuccess)
prec_train = precision_score(y_train, psuccess)
rec_train = recall_score(y_train, psuccess)

# print the scores for the training set

print("Accuracy (Train) : {:.4f}".format(acc_train))
print("Precision (Train): {:.4f}".format(prec_train))
print("Recall (Train)  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, psuccess_test)
prec_test = precision_score(y_test, psuccess_test)
rec_test = recall_score(y_test, psuccess_test)

# print the scores for the testing set

print("Accuracy (Test) : {:.4f}".format(acc_test))
print("Precision (Test): {:.4f}".format(prec_test))
print("Recall (Test)  : {:.4f}".format(rec_test))
print("")
Accuracy (Train) : 0.6186
Precision (Train): 0.6154
Recall (Train)  : 0.6382

Accuracy (Test) : 0.6100
Precision (Test): 0.6081
Recall (Test)  : 0.6040

Create And Assess The LASSO and Ridge Regression Models¶

Fix Training And Testing Data¶
In [ ]:
#Fix the training and testing data by removing the columns that were
#added by the full logistic regression model
X.drop('const', axis=1, inplace=True)
X_train.drop(['p_success', 'p_prob', 'y'], axis=1, inplace=True)
X_test.drop(['const', 'p_success', 'p_prob'], axis=1, inplace=True)

Create¶

In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection  using c = 0.1 and c = 0.01

lr_l1_1_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
lr_l1_01_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)

# fit the models to the training data
lr_l1_1_prob_2.fit(X_train, y_train)
lr_l1_01_prob_2.fit(X_train, y_train)

# Create an Instance of Logistic Regression for LASSO Selection  using c = 1 and c = 0.7
lr_l1_10_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lr_l1_7_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.7)

# fit the models to the training data
lr_l1_10_prob_2.fit(X_train, y_train)
lr_l1_7_prob_2.fit(X_train, y_train)

# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2_prob_2 = LogisticRegression(penalty='l2', solver='liblinear')

# fit the models to the training data
lr_l2_prob_2.fit(X_train, y_train)
Out[ ]:
LogisticRegression(solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(solver='liblinear')

Analyze The Importance Of Different Categories In The Models¶

In [ ]:
# function for model coefficients
def rpt_model_variables(model):
    # Get the intercept term
    intercept = model.intercept_

    # Access the coefficients (weights) of the model, i rounded them
    coefficients = np.round(model.coef_[0],decimals=4)

    # Create DataFrames for intercept and coefficients
    #df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
    df_coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

    # if you want to add intercept to table
    #df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)

    # Print the DataFrame
    print(df_coefficients)

    return df_coefficients

#Evalute the model coefficients for the models
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1_prob_2)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01_prob_2)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10_prob_2)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7_prob_2)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2_prob_2)
Lasso C=0.1
                        feature  coefficient  abs_coefficient
9                     room_type      -0.1566           0.1566
13                         beds      -0.0629           0.0629
7        host_identity_verified      -0.0564           0.0564
19              guests_included      -0.0482           0.0482
8                 property_type      -0.0395           0.0395
10                 accommodates      -0.0223           0.0223
21               minimum_nights       0.0177           0.0177
12                     bedrooms       0.0167           0.0167
3            host_response_rate       0.0126           0.0126
22         review_scores_rating       0.0061           0.0061
20                 extra_people      -0.0036           0.0036
15                        price       0.0025           0.0025
5           host_listings_count      -0.0010           0.0010
2            host_response_time      -0.0009           0.0009
0                    host_since      -0.0006           0.0006
16                 weekly_price      -0.0004           0.0004
1                 host_location       0.0000           0.0000
17             security_deposit       0.0000           0.0000
18                 cleaning_fee       0.0000           0.0000
14                     bed_type       0.0000           0.0000
11                    bathrooms       0.0000           0.0000
6          host_has_profile_pic       0.0000           0.0000
4             host_is_superhost       0.0000           0.0000
23       review_scores_accuracy       0.0000           0.0000
24    review_scores_cleanliness       0.0000           0.0000
25        review_scores_checkin       0.0000           0.0000
26  review_scores_communication       0.0000           0.0000
27       review_scores_location       0.0000           0.0000
28          review_scores_value       0.0000           0.0000
29             instant_bookable       0.0000           0.0000
30          cancellation_policy       0.0000           0.0000

Lasso C=0.01
                        feature  coefficient  abs_coefficient
8                 property_type      -0.0158           0.0158
3            host_response_rate       0.0077           0.0077
20                 extra_people      -0.0043           0.0043
15                        price       0.0017           0.0017
18                 cleaning_fee      -0.0015           0.0015
5           host_listings_count      -0.0006           0.0006
16                 weekly_price      -0.0003           0.0003
0                    host_since      -0.0002           0.0002
17             security_deposit       0.0001           0.0001
7        host_identity_verified       0.0000           0.0000
2            host_response_time       0.0000           0.0000
29             instant_bookable       0.0000           0.0000
28          review_scores_value       0.0000           0.0000
27       review_scores_location       0.0000           0.0000
26  review_scores_communication       0.0000           0.0000
25        review_scores_checkin       0.0000           0.0000
24    review_scores_cleanliness       0.0000           0.0000
23       review_scores_accuracy       0.0000           0.0000
22         review_scores_rating       0.0000           0.0000
21               minimum_nights       0.0000           0.0000
19              guests_included       0.0000           0.0000
9                     room_type       0.0000           0.0000
4             host_is_superhost       0.0000           0.0000
6          host_has_profile_pic       0.0000           0.0000
1                 host_location       0.0000           0.0000
14                     bed_type       0.0000           0.0000
13                         beds       0.0000           0.0000
12                     bedrooms       0.0000           0.0000
11                    bathrooms       0.0000           0.0000
10                 accommodates       0.0000           0.0000
30          cancellation_policy       0.0000           0.0000

Lasso C=1
                        feature  coefficient  abs_coefficient
9                     room_type      -0.4856           0.4856
26  review_scores_communication       0.3969           0.3969
7        host_identity_verified      -0.3062           0.3062
4             host_is_superhost      -0.2645           0.2645
12                     bedrooms       0.2355           0.2355
1                 host_location       0.2241           0.2241
13                         beds      -0.1213           0.1213
14                     bed_type      -0.1122           0.1122
24    review_scores_cleanliness       0.0991           0.0991
19              guests_included      -0.0808           0.0808
2            host_response_time      -0.0699           0.0699
23       review_scores_accuracy       0.0691           0.0691
25        review_scores_checkin      -0.0622           0.0622
10                 accommodates      -0.0579           0.0579
28          review_scores_value       0.0471           0.0471
29             instant_bookable       0.0390           0.0390
8                 property_type      -0.0360           0.0360
21               minimum_nights       0.0270           0.0270
11                    bathrooms      -0.0243           0.0243
3            host_response_rate       0.0150           0.0150
22         review_scores_rating      -0.0114           0.0114
30          cancellation_policy      -0.0098           0.0098
20                 extra_people      -0.0025           0.0025
15                        price       0.0022           0.0022
0                    host_since      -0.0021           0.0021
5           host_listings_count      -0.0010           0.0010
18                 cleaning_fee      -0.0005           0.0005
16                 weekly_price      -0.0003           0.0003
17             security_deposit      -0.0000           0.0000
27       review_scores_location       0.0000           0.0000
6          host_has_profile_pic       0.0000           0.0000

Lasso C=0.7
                        feature  coefficient  abs_coefficient
9                     room_type      -0.4652           0.4652
26  review_scores_communication       0.3591           0.3591
7        host_identity_verified      -0.2932           0.2932
4             host_is_superhost      -0.2486           0.2486
12                     bedrooms       0.2200           0.2200
1                 host_location       0.1872           0.1872
13                         beds      -0.1207           0.1207
14                     bed_type      -0.0912           0.0912
24    review_scores_cleanliness       0.0892           0.0892
19              guests_included      -0.0782           0.0782
2            host_response_time      -0.0629           0.0629
10                 accommodates      -0.0545           0.0545
23       review_scores_accuracy       0.0522           0.0522
28          review_scores_value       0.0410           0.0410
8                 property_type      -0.0363           0.0363
25        review_scores_checkin      -0.0318           0.0318
21               minimum_nights       0.0262           0.0262
11                    bathrooms      -0.0150           0.0150
3            host_response_rate       0.0149           0.0149
30          cancellation_policy      -0.0100           0.0100
22         review_scores_rating      -0.0095           0.0095
29             instant_bookable       0.0043           0.0043
20                 extra_people      -0.0026           0.0026
15                        price       0.0023           0.0023
0                    host_since      -0.0021           0.0021
5           host_listings_count      -0.0010           0.0010
18                 cleaning_fee      -0.0005           0.0005
16                 weekly_price      -0.0003           0.0003
17             security_deposit      -0.0000           0.0000
27       review_scores_location       0.0000           0.0000
6          host_has_profile_pic       0.0000           0.0000

Ridge Regression
                        feature  coefficient  abs_coefficient
9                     room_type      -0.5083           0.5083
26  review_scores_communication       0.4477           0.4477
7        host_identity_verified      -0.3260           0.3260
4             host_is_superhost      -0.2901           0.2901
1                 host_location       0.2870           0.2870
12                     bedrooms       0.2671           0.2671
14                     bed_type      -0.1475           0.1475
13                         beds      -0.1237           0.1237
25        review_scores_checkin      -0.1192           0.1192
24    review_scores_cleanliness       0.1168           0.1168
29             instant_bookable       0.1120           0.1120
23       review_scores_accuracy       0.1037           0.1037
2            host_response_time      -0.0859           0.0859
19              guests_included      -0.0858           0.0858
10                 accommodates      -0.0627           0.0627
28          review_scores_value       0.0591           0.0591
11                    bathrooms      -0.0489           0.0489
8                 property_type      -0.0358           0.0358
21               minimum_nights       0.0291           0.0291
3            host_response_rate       0.0154           0.0154
22         review_scores_rating      -0.0153           0.0153
30          cancellation_policy      -0.0103           0.0103
27       review_scores_location       0.0080           0.0080
20                 extra_people      -0.0025           0.0025
0                    host_since      -0.0022           0.0022
15                        price       0.0022           0.0022
5           host_listings_count      -0.0011           0.0011
6          host_has_profile_pic      -0.0006           0.0006
18                 cleaning_fee      -0.0005           0.0005
16                 weekly_price      -0.0003           0.0003
17             security_deposit      -0.0000           0.0000
In [ ]:
# plot variable importance

# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
  # determine the variables the model is using and create df
  # of their absolute coefficients
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  # determine the variables the model is not using
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  # bar graph of the absolute coefficients that the model is using
  plt.figure(figsize=(5, 10))
  plt.title('Variable Importance')
  plt.xlabel('Coefficient')
  plt.ylabel('Feature')
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  # print the variables the model is not using after the bar graph
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")

# plot the variable importance for the models
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Lasso C=0.1
No description has been provided for this image
-- rejected --
 host_location
 security_deposit
 cleaning_fee
 bed_type
 bathrooms
 host_has_profile_pic
 host_is_superhost
 review_scores_accuracy
 review_scores_cleanliness
 review_scores_checkin
 review_scores_communication
 review_scores_location
 review_scores_value
 instant_bookable
 cancellation_policy

Lasso C=0.01
No description has been provided for this image
-- rejected --
 host_identity_verified
 host_response_time
 instant_bookable
 review_scores_value
 review_scores_location
 review_scores_communication
 review_scores_checkin
 review_scores_cleanliness
 review_scores_accuracy
 review_scores_rating
 minimum_nights
 guests_included
 room_type
 host_is_superhost
 host_has_profile_pic
 host_location
 bed_type
 beds
 bedrooms
 bathrooms
 accommodates
 cancellation_policy

Lasso C=1
No description has been provided for this image
-- rejected --
 security_deposit
 review_scores_location
 host_has_profile_pic

Lasso C=0.7
No description has been provided for this image
-- rejected --
 security_deposit
 review_scores_location
 host_has_profile_pic

Ridge Regression
No description has been provided for this image
-- rejected --
 security_deposit

Make Predictions To Evaluate The Models¶

In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models

#Lasso C=0.1
y_pred_train = lr_l1_1_prob_2.predict(X_train)
y_pred_test = lr_l1_1_prob_2.predict(X_test)
y_proba_train = lr_l1_1_prob_2.predict_proba(X_train)
y_proba_test = lr_l1_1_prob_2.predict_proba(X_test)

#Lasso C=0.01
y_pred_train1 = lr_l1_01_prob_2.predict(X_train)
y_pred_test1 = lr_l1_01_prob_2.predict(X_test)
y_proba_train1 = lr_l1_01_prob_2.predict_proba(X_train)
y_proba_test1 = lr_l1_01_prob_2.predict_proba(X_test)

#Lasso C=1
y_pred_train10 = lr_l1_10_prob_2.predict(X_train)
y_pred_test10 = lr_l1_10_prob_2.predict(X_test)
y_proba_train10 = lr_l1_10_prob_2.predict_proba(X_train)
y_proba_test10 = lr_l1_10_prob_2.predict_proba(X_test)

#Lasso C=0.7
y_pred_train7 = lr_l1_7_prob_2.predict(X_train)
y_pred_test7 = lr_l1_7_prob_2.predict(X_test)
y_proba_train7 = lr_l1_7_prob_2.predict_proba(X_train)
y_proba_test7 = lr_l1_7_prob_2.predict_proba(X_test)

#Ridge Regression
y_pred_train2 = lr_l2_prob_2.predict(X_train)
y_pred_test2 = lr_l2_prob_2.predict(X_test)
y_proba_train2 = lr_l2_prob_2.predict_proba(X_train)
y_proba_test2 = lr_l2_prob_2.predict_proba(X_test)

Evaluate The Models¶

L1 with c=0.1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train = accuracy_score(y_train, y_pred_train)
prec2_train = precision_score(y_train, y_pred_train)
rec2_train = recall_score(y_train, y_pred_train)
auc2_train = roc_auc_score(y_train, y_proba_train[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train))
print("Precision: {:.4f}".format(prec2_train))
print("Recall.  : {:.4f}".format(rec2_train))
print("AUC      : {:.4f}".format(auc2_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test = accuracy_score(y_test, y_pred_test)
prec2_test = precision_score(y_test, y_pred_test)
rec2_test = recall_score(y_test, y_pred_test)
auc2_test = roc_auc_score(y_test, y_proba_test[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test))
print("Precision: {:.4f}".format(prec2_test))
print("Recall.  : {:.4f}".format(rec2_test))
print("AUC      : {:.4f}".format(auc2_test))
 -- train set -- 
Accuracy : 0.5743
Precision: 0.5714
Recall.  : 0.6040
AUC      : 0.6181

 -- test set -- 
Accuracy : 0.5667
Precision: 0.5590
Recall.  : 0.6040
AUC      : 0.6036
L1 with c=0.01¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train1 = accuracy_score(y_train, y_pred_train1)
prec2_train1 = precision_score(y_train, y_pred_train1)
rec2_train1 = recall_score(y_train, y_pred_train1)
auc2_train1 = roc_auc_score(y_train, y_proba_train1[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train1))
print("Precision: {:.4f}".format(prec2_train1))
print("Recall.  : {:.4f}".format(rec2_train1))
print("AUC      : {:.4f}".format(auc2_train1))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test1 = accuracy_score(y_test, y_pred_test1)
prec2_test1 = precision_score(y_test, y_pred_test1)
rec2_test1 = recall_score(y_test, y_pred_test1)
auc2_test1 = roc_auc_score(y_test, y_proba_test1[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test1))
print("Precision: {:.4f}".format(prec2_test1))
print("Recall.  : {:.4f}".format(rec2_test1))
print("AUC      : {:.4f}".format(auc2_test1))
 -- train set -- 
Accuracy : 0.5471
Precision: 0.5390
Recall.  : 0.6695
AUC      : 0.5952

 -- test set -- 
Accuracy : 0.5433
Precision: 0.5337
Recall.  : 0.6376
AUC      : 0.5717
L1 with C=1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train10 = accuracy_score(y_train, y_pred_train10)
prec2_train10 = precision_score(y_train, y_pred_train10)
rec2_train10 = recall_score(y_train, y_pred_train10)
auc2_train10 = roc_auc_score(y_train, y_proba_train10[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train10))
print("Precision: {:.4f}".format(prec2_train10))
print("Recall.  : {:.4f}".format(rec2_train10))
print("AUC      : {:.4f}".format(auc2_train10))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test10 = accuracy_score(y_test, y_pred_test10)
prec2_test10 = precision_score(y_test, y_pred_test10)
rec2_test10 = recall_score(y_test, y_pred_test10)
auc2_test10 = roc_auc_score(y_test, y_proba_test10[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test10))
print("Precision: {:.4f}".format(prec2_test10))
print("Recall.  : {:.4f}".format(rec2_test10))
print("AUC      : {:.4f}".format(auc2_test10))
 -- train set -- 
Accuracy : 0.6143
Precision: 0.6116
Recall.  : 0.6325
AUC      : 0.6523

 -- test set -- 
Accuracy : 0.6000
Precision: 0.5973
Recall.  : 0.5973
AUC      : 0.6206
L1 with C=0.7¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train7 = accuracy_score(y_train, y_pred_train7)
prec2_train7 = precision_score(y_train, y_pred_train7)
rec2_train7 = recall_score(y_train, y_pred_train7)
auc2_train7 = roc_auc_score(y_train, y_proba_train7[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train7))
print("Precision: {:.4f}".format(prec2_train7))
print("Recall.  : {:.4f}".format(rec2_train7))
print("AUC      : {:.4f}".format(auc2_train7))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test7 = accuracy_score(y_test, y_pred_test7)
prec2_test7 = precision_score(y_test, y_pred_test7)
rec2_test7 = recall_score(y_test, y_pred_test7)
auc2_test7 = roc_auc_score(y_test, y_proba_test7[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test7))
print("Precision: {:.4f}".format(prec2_test7))
print("Recall.  : {:.4f}".format(rec2_test7))
print("AUC      : {:.4f}".format(auc2_test7))
 -- train set -- 
Accuracy : 0.6143
Precision: 0.6122
Recall.  : 0.6296
AUC      : 0.6519

 -- test set -- 
Accuracy : 0.5900
Precision: 0.5878
Recall.  : 0.5839
AUC      : 0.6223
L2 Regularization¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train2 = accuracy_score(y_train, y_pred_train2)
prec2_train2 = precision_score(y_train, y_pred_train2)
rec2_train2 = recall_score(y_train, y_pred_train2)
auc2_train2 = roc_auc_score(y_train, y_proba_train2[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train2))
print("Precision: {:.4f}".format(prec2_train2))
print("Recall.  : {:.4f}".format(rec2_train2))
print("AUC      : {:.4f}".format(auc2_train2))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test2 = accuracy_score(y_test, y_pred_test2)
prec2_test2 = precision_score(y_test, y_pred_test2)
rec2_test2 = recall_score(y_test, y_pred_test2)
auc2_test2 = roc_auc_score(y_test, y_proba_test2[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test2))
print("Precision: {:.4f}".format(prec2_test2))
print("Recall.  : {:.4f}".format(rec2_test2))
print("AUC      : {:.4f}".format(auc2_test2))
 -- train set -- 
Accuracy : 0.6171
Precision: 0.6150
Recall.  : 0.6325
AUC      : 0.6534

 -- test set -- 
Accuracy : 0.6033
Precision: 0.6000
Recall.  : 0.6040
AUC      : 0.6186

Problem 3¶

Linear Regression¶

In [ ]:
df_prob_3 = df.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1)
df_prob_3 = df_prob_3[df_prob_3['review_scores_rating'] >= 80]
In [ ]:
# Display summary statistics
print(df_prob_3.describe())

# Compute correlations
cormat = df_prob_3.corr()

# Round correlation matrix to 2 decimal places
cormat = cormat.round(2)

# Plot correlation matrix using a heatmap
plt.figure(figsize = (20,16))
sns.heatmap(cormat,annot=True, cmap = 'coolwarm')
plt.show()
        host_since  host_response_time  host_response_rate  \
count  3320.000000         3320.000000         3320.000000   
mean   2013.023494            2.281024           94.146988   
std       1.466526            0.799745           12.624938   
min    2008.000000            0.000000           13.000000   
25%    2012.000000            2.000000           93.000000   
50%    2013.000000            2.000000          100.000000   
75%    2014.000000            3.000000          100.000000   
max    2015.000000            3.000000          100.000000   

       host_listings_count  property_type    room_type  accommodates  \
count          3320.000000    3320.000000  3320.000000   3320.000000   
mean             15.385241       6.401506     0.310241      4.437952   
std              63.969021       4.235170     0.502015      2.812790   
min               1.000000       0.000000     0.000000      1.000000   
25%               1.000000       0.000000     0.000000      2.000000   
50%               1.000000       9.000000     0.000000      4.000000   
75%               2.000000       9.000000     1.000000      6.000000   
max             339.000000      17.000000     2.000000     16.000000   

         bathrooms     bedrooms         beds  ...  guests_included  \
count  3320.000000  3320.000000  3320.000000  ...      3320.000000   
mean      1.426054     1.696988     2.260843  ...         2.068373   
std       0.715703     1.153635     1.740650  ...         1.865101   
min       0.000000     0.000000     1.000000  ...         0.000000   
25%       1.000000     1.000000     1.000000  ...         1.000000   
50%       1.000000     1.000000     2.000000  ...         1.000000   
75%       2.000000     2.000000     3.000000  ...         2.000000   
max       7.000000    10.000000    16.000000  ...        16.000000   

       extra_people  minimum_nights  availability_30  availability_60  \
count   3320.000000     3320.000000      3320.000000      3320.000000   
mean      14.991566        1.965361        16.756928        37.440964   
std       28.168219        2.688678        10.891157        20.918877   
min        0.000000        1.000000         0.000000         0.000000   
25%        0.000000        1.000000         6.750000        21.000000   
50%        0.000000        2.000000        19.000000        44.000000   
75%       25.000000        2.000000        28.000000        57.000000   
max      500.000000       60.000000        30.000000        60.000000   

       availability_90  availability_365  number_of_reviews  \
count      3320.000000       3320.000000        3320.000000   
mean         59.978916        273.629819          18.436446   
std          30.595223        114.416397          31.446002   
min           0.000000          0.000000           1.000000   
25%          39.000000        218.750000           3.000000   
50%          71.000000        332.000000           7.000000   
75%          87.000000        359.000000          20.000000   
max          90.000000        365.000000         314.000000   

       review_scores_rating  cancellation_policy  
count           3320.000000          3320.000000  
mean              96.025301             1.739759  
std                5.117595             1.295826  
min               80.000000             0.000000  
25%               94.000000             1.000000  
50%               98.000000             1.000000  
75%              100.000000             3.000000  
max              100.000000             4.000000  

[8 rows x 25 columns]
<ipython-input-974-bbed5e760c05>:5: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  cormat = df_prob_3.corr()
No description has been provided for this image
In [ ]:
# Select only numerical columns
numerical_columns = df_prob_3.select_dtypes(include=[np.number])

# Display the numerical variables
numerical_columns.head()
Out[ ]:
host_since host_response_time host_response_rate host_listings_count property_type room_type accommodates bathrooms bedrooms beds ... guests_included extra_people minimum_nights availability_30 availability_60 availability_90 availability_365 number_of_reviews review_scores_rating cancellation_policy
7 2014 3 100.0 1.0 9 1 2 1.0 1.0 1.0 ... 2 10.0 1 16 16 37 312 2 100.0 1
9 2012 3 100.0 1.0 9 1 2 1.0 1.0 1.0 ... 2 19.0 1 29 59 89 364 20 99.0 1
11 2011 3 100.0 1.0 9 1 2 1.0 1.0 1.0 ... 1 10.0 3 24 54 84 84 9 93.0 3
12 2013 3 97.0 9.0 9 1 2 1.0 1.0 1.0 ... 1 0.0 2 25 55 85 360 4 100.0 3
13 2013 3 97.0 9.0 9 1 2 1.0 1.0 1.0 ... 1 20.0 3 29 59 89 364 7 100.0 3

5 rows × 25 columns

Initial Model

In [ ]:
# Perform linear regression
numerical_columns = df_prob_3.select_dtypes(include=[np.number])
X = numerical_columns.drop(columns=['review_scores_rating'])
y = numerical_columns['review_scores_rating']
X = sm.add_constant(X)
reg1_prob_3 = sm.OLS(y, X).fit()

# Display regression summary
print(reg1_prob_3.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     review_scores_rating   R-squared:                       0.052
Model:                              OLS   Adj. R-squared:                  0.045
Method:                   Least Squares   F-statistic:                     7.571
Date:                  Wed, 13 Dec 2023   Prob (F-statistic):           2.24e-25
Time:                          22:46:26   Log-Likelihood:                -10042.
No. Observations:                  3320   AIC:                         2.013e+04
Df Residuals:                      3295   BIC:                         2.029e+04
Df Model:                            24                                         
Covariance Type:              nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  73.3358    129.518      0.566      0.571    -180.609     327.280
host_since              0.0101      0.064      0.157      0.875      -0.116       0.136
host_response_time     -0.0911      0.133     -0.685      0.493      -0.352       0.170
host_response_rate      0.0326      0.008      3.889      0.000       0.016       0.049
host_listings_count    -0.0133      0.002     -8.416      0.000      -0.016      -0.010
property_type           0.0979      0.022      4.351      0.000       0.054       0.142
room_type              -0.7683      0.221     -3.478      0.001      -1.201      -0.335
accommodates           -0.1693      0.067     -2.521      0.012      -0.301      -0.038
bathrooms               0.3326      0.204      1.634      0.102      -0.066       0.732
bedrooms               -0.0307      0.156     -0.197      0.844      -0.336       0.274
beds                   -0.0482      0.096     -0.500      0.617      -0.237       0.141
bed_type               -0.1196      0.169     -0.710      0.478      -0.450       0.211
price                  -0.0009      0.002     -0.446      0.656      -0.005       0.003
weekly_price            0.0003      0.000      0.970      0.332      -0.000       0.001
security_deposit        0.0003      0.000      1.029      0.304      -0.000       0.001
cleaning_fee            0.0044      0.002      1.827      0.068      -0.000       0.009
guests_included         0.0786      0.063      1.248      0.212      -0.045       0.202
extra_people           -0.0001      0.004     -0.032      0.974      -0.007       0.007
minimum_nights          0.0247      0.033      0.749      0.454      -0.040       0.089
availability_30         0.0307      0.024      1.280      0.201      -0.016       0.078
availability_60        -0.0409      0.022     -1.822      0.069      -0.085       0.003
availability_90         0.0129      0.012      1.112      0.266      -0.010       0.036
availability_365       -0.0004      0.001     -0.349      0.727      -0.002       0.002
number_of_reviews    5.566e-07      0.003      0.000      1.000      -0.006       0.006
cancellation_policy    -0.1007      0.077     -1.308      0.191      -0.252       0.050
==============================================================================
Omnibus:                      910.589   Durbin-Watson:                   1.981
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2064.947
Skew:                          -1.556   Prob(JB):                         0.00
Kurtosis:                       5.290   Cond. No.                     4.02e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.02e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
vifres = pd.DataFrame()
vifres["Variable"] = X.columns
vifres["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vifres)
               Variable           VIF
0                 const  2.227561e+06
1            host_since  1.181043e+00
2    host_response_time  1.501849e+00
3    host_response_rate  1.486876e+00
4   host_listings_count  1.366618e+00
5         property_type  1.204378e+00
6             room_type  1.632806e+00
7          accommodates  4.735134e+00
8             bathrooms  2.816620e+00
9              bedrooms  4.274533e+00
10                 beds  3.740522e+00
11             bed_type  1.113456e+00
12                price  2.972996e+01
13         weekly_price  2.878216e+01
14     security_deposit  1.575646e+00
15         cleaning_fee  2.371857e+00
16      guests_included  1.829933e+00
17         extra_people  1.333160e+00
18       minimum_nights  1.043525e+00
19      availability_30  9.065570e+00
20      availability_60  2.921048e+01
21      availability_90  1.671801e+01
22     availability_365  1.855702e+00
23    number_of_reviews  1.182531e+00
24  cancellation_policy  1.320730e+00
In [ ]:
def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out = 0.05,
                       verbose=True):

    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print(f'Add  {best_feature} with p-value {best_pval:.4f}')
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print(f'Drop {worst_feature} with p-value {worst_pval:.4f}')
        if not changed:
           break
    return included
In [ ]:
selected_features  = stepwise_selection(X, y)
print('resulting features:')
print(selected_features)
Add  const with p-value 0.0000
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
Add  host_listings_count with p-value 0.0000
Add  cleaning_fee with p-value 0.0000
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
Add  host_response_rate with p-value 0.0000
Add  property_type with p-value 0.0001
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
Add  room_type with p-value 0.0026
resulting features:
['const', 'host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  new_pval = pd.Series(index=excluded)
In [ ]:
# Fit stepwise regression
X = numerical_columns[['host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']]
X = sm.add_constant(X)
stepreg_prob_3 = sm.OLS(y, X).fit()

# Display regression summary
print(stepreg_prob_3.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     review_scores_rating   R-squared:                       0.043
Model:                              OLS   Adj. R-squared:                  0.041
Method:                   Least Squares   F-statistic:                     29.58
Date:                  Wed, 13 Dec 2023   Prob (F-statistic):           1.73e-29
Time:                          22:46:28   Log-Likelihood:                -10058.
No. Observations:                  3320   AIC:                         2.013e+04
Df Residuals:                      3314   BIC:                         2.017e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  92.6527      0.668    138.745      0.000      91.343      93.962
host_listings_count    -0.0144      0.001    -10.017      0.000      -0.017      -0.012
cleaning_fee            0.0050      0.002      2.687      0.007       0.001       0.009
host_response_rate      0.0313      0.007      4.515      0.000       0.018       0.045
property_type           0.0896      0.021      4.240      0.000       0.048       0.131
room_type              -0.5831      0.194     -3.010      0.003      -0.963      -0.203
==============================================================================
Omnibus:                      888.450   Durbin-Watson:                   1.981
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1957.894
Skew:                          -1.535   Prob(JB):                         0.00
Kurtosis:                       5.173   Cond. No.                         879.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
vifresstep = pd.DataFrame()
vifresstep["Variable"] = X.columns
vifresstep["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vifresstep)
              Variable        VIF
0                const  58.965066
1  host_listings_count   1.119284
2         cleaning_fee   1.415161
3   host_response_rate   1.012429
4        property_type   1.059833
5            room_type   1.250513
In [ ]:
# Calculate residuals
residuals = stepreg_prob_3.resid

# Generate Q-Q Plot
fig = sm.qqplot(residuals,fit=True, line='45')
plt.show()


# Residuals vs. Fitted
plt.figure(figsize=(10, 6))
plt.scatter(stepreg_prob_3.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# log transformed regression
X = numerical_columns[['host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']]
X = sm.add_constant(X)
logy = np.log(y)
logreg_prob_3 = sm.OLS(logy, X).fit()

# Display regression summary
print(logreg_prob_3.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     review_scores_rating   R-squared:                       0.043
Model:                              OLS   Adj. R-squared:                  0.041
Method:                   Least Squares   F-statistic:                     29.48
Date:                  Wed, 13 Dec 2023   Prob (F-statistic):           2.22e-29
Time:                          22:46:29   Log-Likelihood:                 4936.7
No. Observations:                  3320   AIC:                            -9861.
Df Residuals:                      3314   BIC:                            -9825.
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   4.5264      0.007    620.361      0.000       4.512       4.541
host_listings_count    -0.0002   1.57e-05    -10.002      0.000      -0.000      -0.000
cleaning_fee         5.229e-05   2.05e-05      2.555      0.011    1.22e-05    9.24e-05
host_response_rate      0.0003   7.57e-05      4.515      0.000       0.000       0.000
property_type           0.0010      0.000      4.258      0.000       0.001       0.001
room_type              -0.0065      0.002     -3.047      0.002      -0.011      -0.002
==============================================================================
Omnibus:                     1012.580   Durbin-Watson:                   1.982
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2560.565
Skew:                          -1.670   Prob(JB):                         0.00
Kurtosis:                       5.711   Cond. No.                         879.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Calculate residuals
residuals = logreg_prob_3.resid

# Generate Q-Q Plot
fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()


# Residuals vs. Fitted
plt.figure(figsize=(10, 6))
plt.scatter(logreg_prob_3.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# square root transformed regression
X = numerical_columns[['host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']]
X = sm.add_constant(X)
sqrty = np.sqrt(y)
sqrtreg_prob_3 = sm.OLS(logy, X).fit()

# Display regression summary
print(sqrtreg_prob_3.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     review_scores_rating   R-squared:                       0.043
Model:                              OLS   Adj. R-squared:                  0.041
Method:                   Least Squares   F-statistic:                     29.48
Date:                  Wed, 13 Dec 2023   Prob (F-statistic):           2.22e-29
Time:                          22:46:29   Log-Likelihood:                 4936.7
No. Observations:                  3320   AIC:                            -9861.
Df Residuals:                      3314   BIC:                            -9825.
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   4.5264      0.007    620.361      0.000       4.512       4.541
host_listings_count    -0.0002   1.57e-05    -10.002      0.000      -0.000      -0.000
cleaning_fee         5.229e-05   2.05e-05      2.555      0.011    1.22e-05    9.24e-05
host_response_rate      0.0003   7.57e-05      4.515      0.000       0.000       0.000
property_type           0.0010      0.000      4.258      0.000       0.001       0.001
room_type              -0.0065      0.002     -3.047      0.002      -0.011      -0.002
==============================================================================
Omnibus:                     1012.580   Durbin-Watson:                   1.982
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2560.565
Skew:                          -1.670   Prob(JB):                         0.00
Kurtosis:                       5.711   Cond. No.                         879.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Calculate residuals
residuals = sqrtreg_prob_3.resid

# Generate Q-Q Plot
fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()


# Residuals vs. Fitted
plt.figure(figsize=(10, 6))
plt.scatter(sqrtreg_prob_3.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image

Classifiers ("5" Catgories)¶

In [ ]:
def categorize_review_scores(score):
  if score < 20:
    return 0
  elif score >= 20 and score < 40:
    return 1
  elif score >= 40 and score < 60:
    return 2
  elif score >= 60 and score < 80:
    return 3
  else:
    return 4
In [ ]:
df_prob_3_2 = df.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1)
df_prob_3_2['review_scores_rating'] = df_prob_3_2['review_scores_rating'].apply(lambda x: categorize_review_scores(x)).astype('category')
df_prob_3_2['review_scores_rating'].head(60)
Out[ ]:
7      4
9      4
11     4
12     4
13     4
14     4
16     4
19     4
23     4
26     4
27     4
28     4
29     3
34     4
35     4
37     4
39     4
40     4
44     4
46     4
50     4
51     4
53     4
56     4
57     4
58     4
60     4
62     4
63     4
65     4
66     4
67     4
68     4
71     4
72     4
75     4
76     4
77     4
83     4
85     4
86     4
87     4
88     4
89     4
93     4
94     4
96     4
100    4
101    4
102    4
103    4
104    4
107    4
108    4
110    4
111    4
113    4
114    4
115    4
116    4
Name: review_scores_rating, dtype: category
Categories (4, int64): [1, 2, 3, 4]
In [ ]:
#No reviews are below 20
df_prob_3_2[df_prob_3_2['review_scores_rating'] == 0]
Out[ ]:
host_since host_location host_response_time host_response_rate host_is_superhost host_listings_count host_has_profile_pic host_identity_verified property_type room_type ... extra_people minimum_nights availability_30 availability_60 availability_90 availability_365 number_of_reviews review_scores_rating instant_bookable cancellation_policy

0 rows × 30 columns

In [ ]:
df_prob_3_2.groupby('review_scores_rating').count()
Out[ ]:
host_since host_location host_response_time host_response_rate host_is_superhost host_listings_count host_has_profile_pic host_identity_verified property_type room_type ... guests_included extra_people minimum_nights availability_30 availability_60 availability_90 availability_365 number_of_reviews instant_bookable cancellation_policy
review_scores_rating
1 5 5 5 5 5 5 5 5 5 5 ... 5 5 5 5 5 5 5 5 5 5
2 4 4 4 4 4 4 4 4 4 4 ... 4 4 4 4 4 4 4 4 4 4
3 43 43 43 43 43 43 43 43 43 43 ... 43 43 43 43 43 43 43 43 43 43
4 3320 3320 3320 3320 3320 3320 3320 3320 3320 3320 ... 3320 3320 3320 3320 3320 3320 3320 3320 3320 3320

4 rows × 29 columns

In [ ]:
df_prob_3_sampled = df_prob_3_2.groupby('review_scores_rating').apply(lambda s: s.sample(min(len(s), 30)))
In [ ]:
#Split the data on what the model is learning to predict, whether an AirBnB will be booked
X = df_prob_3_sampled.drop('review_scores_rating', axis=1)

y = df_prob_3_sampled['review_scores_rating']

#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48 entries, (2, 2696) to (3, 3665)
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   host_since              48 non-null     int64   
 1   host_location           48 non-null     category
 2   host_response_time      48 non-null     int64   
 3   host_response_rate      48 non-null     float64 
 4   host_is_superhost       48 non-null     category
 5   host_listings_count     48 non-null     float64 
 6   host_has_profile_pic    48 non-null     category
 7   host_identity_verified  48 non-null     category
 8   property_type           48 non-null     int64   
 9   room_type               48 non-null     int64   
 10  accommodates            48 non-null     int64   
 11  bathrooms               48 non-null     float64 
 12  bedrooms                48 non-null     float64 
 13  beds                    48 non-null     float64 
 14  bed_type                48 non-null     int64   
 15  price                   48 non-null     float64 
 16  weekly_price            48 non-null     float64 
 17  security_deposit        48 non-null     float64 
 18  cleaning_fee            48 non-null     float64 
 19  guests_included         48 non-null     int64   
 20  extra_people            48 non-null     float64 
 21  minimum_nights          48 non-null     int64   
 22  availability_30         48 non-null     int64   
 23  availability_60         48 non-null     int64   
 24  availability_90         48 non-null     int64   
 25  availability_365        48 non-null     int64   
 26  number_of_reviews       48 non-null     int64   
 27  instant_bookable        48 non-null     category
 28  cancellation_policy     48 non-null     int64   
dtypes: category(5), float64(10), int64(14)
memory usage: 12.9 KB
<class 'pandas.core.series.Series'>
MultiIndex: 48 entries, (2, 2696) to (3, 3665)
Series name: review_scores_rating
Non-Null Count  Dtype   
--------------  -----   
48 non-null     category
dtypes: category(1)
memory usage: 3.3 KB

Create And Assess Decision Tree Classifiers¶

Default Tree¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_3 = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)

# fit the model to the training data
dt_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['1','2','3','4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_prob_3")
Out[ ]:
'decision_tree_default_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3,
                'tree.dot',
                class_names=['1','2','3','4'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c4041d50>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3.predict(X_train)
y_pred_test = dt_prob_3.predict(X_test)

y_prob_train = dt_prob_3.predict_proba(X_train)
y_prob_test = dt_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6458
Precision: ['0.0000', '0.0000', '0.6061', '0.7333']
Recall: ['0.0000', '0.0000', '0.8333', '0.6471']

 -- test set -- 
Accuracy : 0.5238
Precision: ['0.0000', '0.0000', '0.3000', '0.7273']
Recall: ['0.0000', '0.0000', '0.5000', '0.6154']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0  0  0  0]
 [ 0  0  0  0]
 [ 4  3 20  6]
 [ 0  0  4 11]]
[[0 0 0 0]
 [0 0 0 0]
 [1 1 3 5]
 [0 0 3 8]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.80
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
0.70
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
26    0.808137       number_of_reviews
23    0.191863         availability_60
0     0.000000              host_since
15    0.000000                   price
27    0.000000        instant_bookable
25    0.000000        availability_365
24    0.000000         availability_90
22    0.000000         availability_30
21    0.000000          minimum_nights
20    0.000000            extra_people
19    0.000000         guests_included
18    0.000000            cleaning_fee
17    0.000000        security_deposit
16    0.000000            weekly_price
14    0.000000                bed_type
1     0.000000           host_location
13    0.000000                    beds
12    0.000000                bedrooms
11    0.000000               bathrooms
10    0.000000            accommodates
9     0.000000               room_type
8     0.000000           property_type
7     0.000000  host_identity_verified
6     0.000000    host_has_profile_pic
5     0.000000     host_listings_count
4     0.000000       host_is_superhost
3     0.000000      host_response_rate
2     0.000000      host_response_time
28    0.000000     cancellation_policy
No description has been provided for this image

Tuned Tree¶

In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune_prob_3 = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune_prob_3, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
print(best_estimator)
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 3 members, which is less than n_splits=5.
  warnings.warn(
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 10}
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=10)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values

dt_tuned_prob_3 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=10, ccp_alpha = 0)

# fit the model to the training data
dt_tuned_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['1', '2', '3', '4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_prob_3")
Out[ ]:
'decision_tree_tuned_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_prob_3,
                'tree.dot',
                class_names=['1', '2', '3', '4'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3a1fe80>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_prob_3.predict(X_train)
y_pred_test = dt_tuned_prob_3.predict(X_test)

y_prob_train = dt_tuned_prob_3.predict_proba(X_train)
y_prob_test = dt_tuned_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6458
Precision: ['0.0000', '0.0000', '0.6061', '0.7333']
Recall: ['0.0000', '0.0000', '0.8333', '0.6471']

 -- test set -- 
Accuracy : 0.5238
Precision: ['0.0000', '0.0000', '0.3000', '0.7273']
Recall: ['0.0000', '0.0000', '0.5000', '0.6154']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0  0  0  0]
 [ 0  0  0  0]
 [ 4  3 20  6]
 [ 0  0  4 11]]
[[0 0 0 0]
 [0 0 0 0]
 [1 1 3 5]
 [0 0 3 8]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.80
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
0.70
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_prob_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
26    0.808137       number_of_reviews
23    0.191863         availability_60
0     0.000000              host_since
15    0.000000                   price
27    0.000000        instant_bookable
25    0.000000        availability_365
24    0.000000         availability_90
22    0.000000         availability_30
21    0.000000          minimum_nights
20    0.000000            extra_people
19    0.000000         guests_included
18    0.000000            cleaning_fee
17    0.000000        security_deposit
16    0.000000            weekly_price
14    0.000000                bed_type
1     0.000000           host_location
13    0.000000                    beds
12    0.000000                bedrooms
11    0.000000               bathrooms
10    0.000000            accommodates
9     0.000000               room_type
8     0.000000           property_type
7     0.000000  host_identity_verified
6     0.000000    host_has_profile_pic
5     0.000000     host_listings_count
4     0.000000       host_is_superhost
3     0.000000      host_response_rate
2     0.000000      host_response_time
28    0.000000     cancellation_policy
No description has been provided for this image

Tree With No Min¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_3_2 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=1, ccp_alpha = 0)

# fit the model to the training data
dt_prob_3_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_2, filled=True, rounded=True, feature_names=X.columns, class_names=['1', '2', '3', '4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_no_min_prob_3")
Out[ ]:
'decision_tree_no_min_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_2,
                'tree.dot',
                class_names=['1','2', '3', '4'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c288e0b0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_2.predict(X_train)
y_pred_test = dt_prob_3_2.predict(X_test)

y_prob_train = dt_prob_3_2.predict_proba(X_train)
y_prob_test = dt_prob_3_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 1.0000
Precision: ['1.0000', '1.0000', '1.0000', '1.0000']
Recall: ['1.0000', '1.0000', '1.0000', '1.0000']

 -- test set -- 
Accuracy : 0.4286
Precision: ['0.0000', '0.0000', '0.2000', '0.7000']
Recall: ['0.0000', '0.0000', '0.3333', '0.5385']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 4  0  0  0]
 [ 0  3  0  0]
 [ 0  0 24  0]
 [ 0  0  0 17]]
[[0 0 1 0]
 [0 0 0 0]
 [1 1 2 6]
 [0 0 3 7]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.62
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
0.49
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_2.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
26    0.209155       number_of_reviews
25    0.113154        availability_365
0     0.108305              host_since
5     0.086512     host_listings_count
11    0.062132               bathrooms
8     0.061720           property_type
7     0.061669  host_identity_verified
27    0.060280        instant_bookable
19    0.050919         guests_included
2     0.050368      host_response_time
18    0.045262            cleaning_fee
10    0.045262            accommodates
22    0.045262         availability_30
24    0.000000         availability_90
23    0.000000         availability_60
21    0.000000          minimum_nights
20    0.000000            extra_people
14    0.000000                bed_type
17    0.000000        security_deposit
16    0.000000            weekly_price
15    0.000000                   price
1     0.000000           host_location
13    0.000000                    beds
12    0.000000                bedrooms
9     0.000000               room_type
6     0.000000    host_has_profile_pic
4     0.000000       host_is_superhost
3     0.000000      host_response_rate
28    0.000000     cancellation_policy
No description has been provided for this image

Tree With No Min, Less Depth¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_3_3 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf=1, ccp_alpha = 0)

# fit the model to the training data
dt_prob_3_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_3, filled=True, rounded=True, feature_names=X.columns, class_names=['1','2','3','4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_no_min_less_depth_prob_3")
Out[ ]:
'decision_tree_no_min_less_depth_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_3,
                'tree.dot',
                class_names=['1','2','3','4'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3a1c3a0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_3.predict(X_train)
y_pred_test = dt_prob_3_3.predict(X_test)

y_prob_train = dt_prob_3_3.predict_proba(X_train)
y_prob_test = dt_prob_3_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 1.0000
Precision: ['1.0000', '1.0000', '1.0000', '1.0000']
Recall: ['1.0000', '1.0000', '1.0000', '1.0000']

 -- test set -- 
Accuracy : 0.3333
Precision: ['0.0000', '0.0000', '0.1818', '0.6250']
Recall: ['0.0000', '0.0000', '0.3333', '0.3846']
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 4  0  0  0]
 [ 0  3  0  0]
 [ 0  0 24  0]
 [ 0  0  0 17]]
[[0 0 1 0]
 [0 0 0 1]
 [1 1 2 7]
 [0 0 3 5]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.56
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
0.46
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
26    0.254417       number_of_reviews
27    0.122001        instant_bookable
25    0.090523        availability_365
5     0.086512     host_listings_count
13    0.067893                    beds
0     0.063043              host_since
11    0.062132               bathrooms
7     0.061669  host_identity_verified
20    0.050919            extra_people
2     0.050368      host_response_time
15    0.045262                   price
10    0.045262            accommodates
21    0.000000          minimum_nights
23    0.000000         availability_60
22    0.000000         availability_30
18    0.000000            cleaning_fee
24    0.000000         availability_90
19    0.000000         guests_included
14    0.000000                bed_type
17    0.000000        security_deposit
16    0.000000            weekly_price
1     0.000000           host_location
12    0.000000                bedrooms
9     0.000000               room_type
8     0.000000           property_type
6     0.000000    host_has_profile_pic
4     0.000000       host_is_superhost
3     0.000000      host_response_rate
28    0.000000     cancellation_policy
No description has been provided for this image

Create And Assess Logistic Regression Models¶

Full Logistic¶

In [ ]:
# define the multinomial logistic regression model
logistic_model_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# fit the model on the training data
logistic_model_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')

Create The LASSO and Ridge Regression Models¶

In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection  using c = 0.1 and c = 0.01

lr_l1_1_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.1)
lr_l1_01_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.01)

# fit the models to the training data
lr_l1_1_prob_3.fit(X_train, y_train)
lr_l1_01_prob_3.fit(X_train, y_train)

# Create an Instance of Logistic Regression for LASSO Selection  using c = 1 and c = 0.7
lr_l1_10_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=1)
lr_l1_7_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.7)

# fit the models to the training data
lr_l1_10_prob_3.fit(X_train, y_train)
lr_l1_7_prob_3.fit(X_train, y_train)

# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2')

# fit the models to the training data
lr_l2_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')

Analyze The Importance Of Different Categories In The Models¶

In [ ]:
# function for model coefficients
def rpt_model_variables(model):
    # Get the intercept term
    intercept = model.intercept_

    # Access the coefficients (weights) of the model, i rounded them
    coefficients = np.round(model.coef_[0],decimals=4)

    # Create DataFrames for intercept and coefficients
    #df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
    df_coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

    # if you want to add intercept to table
    #df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)

    # Print the DataFrame
    print(df_coefficients)

    return df_coefficients

#Evalute the model coefficients for the models
print("Full Logistic Regression Model")
df_coefficients_full = rpt_model_variables(logistic_model_prob_3)
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1_prob_3)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01_prob_3)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10_prob_3)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7_prob_3)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2_prob_3)
Full Logistic Regression Model
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0642           0.0642
5      host_listings_count       0.0251           0.0251
25        availability_365      -0.0205           0.0205
3       host_response_rate       0.0128           0.0128
23         availability_60       0.0113           0.0113
15                   price      -0.0108           0.0108
26       number_of_reviews      -0.0093           0.0093
20            extra_people       0.0088           0.0088
24         availability_90       0.0087           0.0087
8            property_type      -0.0061           0.0061
0               host_since       0.0026           0.0026
10            accommodates       0.0014           0.0014
19         guests_included       0.0013           0.0013
27        instant_bookable       0.0012           0.0012
16            weekly_price       0.0011           0.0011
2       host_response_time       0.0010           0.0010
12                bedrooms       0.0007           0.0007
9                room_type       0.0007           0.0007
13                    beds      -0.0006           0.0006
14                bed_type       0.0005           0.0005
18            cleaning_fee      -0.0004           0.0004
21          minimum_nights       0.0004           0.0004
28     cancellation_policy       0.0004           0.0004
11               bathrooms       0.0003           0.0003
7   host_identity_verified       0.0003           0.0003
22         availability_30      -0.0002           0.0002
4        host_is_superhost      -0.0002           0.0002
1            host_location       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
Lasso C=0.1
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0016           0.0016
5      host_listings_count       0.0011           0.0011
0               host_since      -0.0005           0.0005
15                   price      -0.0002           0.0002
16            weekly_price       0.0002           0.0002
25        availability_365      -0.0001           0.0001
18            cleaning_fee       0.0001           0.0001
27        instant_bookable       0.0000           0.0000
26       number_of_reviews      -0.0000           0.0000
24         availability_90       0.0000           0.0000
23         availability_60       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
20            extra_people       0.0000           0.0000
19         guests_included       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type      -0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
3       host_response_rate       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Lasso C=0.01
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0013           0.0013
5      host_listings_count       0.0008           0.0008
0               host_since      -0.0005           0.0005
16            weekly_price       0.0002           0.0002
15                   price       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
26       number_of_reviews       0.0000           0.0000
25        availability_365       0.0000           0.0000
24         availability_90       0.0000           0.0000
23         availability_60       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
20            extra_people       0.0000           0.0000
19         guests_included       0.0000           0.0000
18            cleaning_fee       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type       0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
3       host_response_rate       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Lasso C=1
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0017           0.0017
5      host_listings_count       0.0012           0.0012
0               host_since      -0.0005           0.0005
16            weekly_price       0.0002           0.0002
18            cleaning_fee       0.0002           0.0002
15                   price      -0.0002           0.0002
3       host_response_rate       0.0001           0.0001
26       number_of_reviews      -0.0001           0.0001
25        availability_365      -0.0001           0.0001
23         availability_60       0.0001           0.0001
27        instant_bookable       0.0000           0.0000
24         availability_90       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
20            extra_people       0.0000           0.0000
19         guests_included       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type      -0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Lasso C=0.7
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0016           0.0016
5      host_listings_count       0.0012           0.0012
0               host_since      -0.0005           0.0005
16            weekly_price       0.0002           0.0002
18            cleaning_fee       0.0002           0.0002
15                   price      -0.0002           0.0002
3       host_response_rate       0.0001           0.0001
26       number_of_reviews      -0.0001           0.0001
25        availability_365      -0.0001           0.0001
23         availability_60       0.0001           0.0001
27        instant_bookable       0.0000           0.0000
24         availability_90       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
20            extra_people       0.0000           0.0000
19         guests_included       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type      -0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Ridge Regression
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0642           0.0642
5      host_listings_count       0.0251           0.0251
25        availability_365      -0.0205           0.0205
3       host_response_rate       0.0128           0.0128
23         availability_60       0.0113           0.0113
15                   price      -0.0108           0.0108
26       number_of_reviews      -0.0093           0.0093
20            extra_people       0.0088           0.0088
24         availability_90       0.0087           0.0087
8            property_type      -0.0061           0.0061
0               host_since       0.0026           0.0026
10            accommodates       0.0014           0.0014
19         guests_included       0.0013           0.0013
27        instant_bookable       0.0012           0.0012
16            weekly_price       0.0011           0.0011
2       host_response_time       0.0010           0.0010
12                bedrooms       0.0007           0.0007
9                room_type       0.0007           0.0007
13                    beds      -0.0006           0.0006
14                bed_type       0.0005           0.0005
18            cleaning_fee      -0.0004           0.0004
21          minimum_nights       0.0004           0.0004
28     cancellation_policy       0.0004           0.0004
11               bathrooms       0.0003           0.0003
7   host_identity_verified       0.0003           0.0003
22         availability_30      -0.0002           0.0002
4        host_is_superhost      -0.0002           0.0002
1            host_location       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
In [ ]:
# plot variable importance

# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
  # determine the variables the model is using and create df
  # of their absolute coefficients
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  # determine the variables the model is not using
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  # bar graph of the absolute coefficients that the model is using
  plt.figure(figsize=(5, 10))
  plt.title('Variable Importance')
  plt.xlabel('Coefficient')
  plt.ylabel('Feature')
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  # print the variables the model is not using after the bar graph
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")

# plot the variable importance for the models
print("Full Logistic Regression Model")
plot_variable_imp(df_coefficients_full)
print("")
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Full Logistic Regression Model
No description has been provided for this image
-- rejected --
 host_location
 host_has_profile_pic

Lasso C=0.1
No description has been provided for this image
-- rejected --
 instant_bookable
 number_of_reviews
 availability_90
 availability_60
 availability_30
 minimum_nights
 extra_people
 guests_included
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_rate
 host_response_time
 cancellation_policy

Lasso C=0.01
No description has been provided for this image
-- rejected --
 price
 instant_bookable
 number_of_reviews
 availability_365
 availability_90
 availability_60
 availability_30
 minimum_nights
 extra_people
 guests_included
 cleaning_fee
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_rate
 host_response_time
 cancellation_policy

Lasso C=1
No description has been provided for this image
-- rejected --
 instant_bookable
 availability_90
 availability_30
 minimum_nights
 extra_people
 guests_included
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_time
 cancellation_policy

Lasso C=0.7
No description has been provided for this image
-- rejected --
 instant_bookable
 availability_90
 availability_30
 minimum_nights
 extra_people
 guests_included
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_time
 cancellation_policy

Ridge Regression
No description has been provided for this image
-- rejected --
 host_location
 host_has_profile_pic

Make Predictions To Evaluate The Models¶

In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models

#Full Regression
y_pred_train_full = logistic_model_prob_3.predict(X_train)
y_pred_test_full = logistic_model_prob_3.predict(X_test)
y_proba_train_full = logistic_model_prob_3.predict_proba(X_train)
y_proba_test_full = logistic_model_prob_3.predict_proba(X_test)

#Lasso C=0.1
y_pred_train = lr_l1_1_prob_3.predict(X_train)
y_pred_test = lr_l1_1_prob_3.predict(X_test)
y_proba_train = lr_l1_1_prob_3.predict_proba(X_train)
y_proba_test = lr_l1_1_prob_3.predict_proba(X_test)

#Lasso C=0.01
y_pred_train1 = lr_l1_01_prob_3.predict(X_train)
y_pred_test1 = lr_l1_01_prob_3.predict(X_test)
y_proba_train1 = lr_l1_01_prob_3.predict_proba(X_train)
y_proba_test1 = lr_l1_01_prob_3.predict_proba(X_test)

#Lasso C=1
y_pred_train10 = lr_l1_10_prob_3.predict(X_train)
y_pred_test10 = lr_l1_10_prob_3.predict(X_test)
y_proba_train10 = lr_l1_10_prob_3.predict_proba(X_train)
y_proba_test10 = lr_l1_10_prob_3.predict_proba(X_test)

#Lasso C=0.7
y_pred_train7 = lr_l1_7_prob_3.predict(X_train)
y_pred_test7 = lr_l1_7_prob_3.predict(X_test)
y_proba_train7 = lr_l1_7_prob_3.predict_proba(X_train)
y_proba_test7 = lr_l1_7_prob_3.predict_proba(X_test)

#Ridge Regression
y_pred_train2 = lr_l2_prob_3.predict(X_train)
y_pred_test2 = lr_l2_prob_3.predict(X_test)
y_proba_train2 = lr_l2_prob_3.predict_proba(X_train)
y_proba_test2 = lr_l2_prob_3.predict_proba(X_test)

Evaluate The Models¶

Full Model¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train_full)
prec_train = precision_score(y_train, y_pred_train_full, average=None)
rec_train = recall_score(y_train, y_pred_train_full, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test_full)
prec_test = precision_score(y_test, y_pred_test_full, average=None)
rec_test = recall_score(y_test, y_pred_test_full, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6667
Precision: ['0.3333', '0.5000', '0.6552', '0.7857']
Recall: ['0.2500', '0.3333', '0.7917', '0.6471']

 -- test set -- 
Accuracy : 0.5238
Precision: ['0.0000', '0.0000', '0.5000', '0.8571']
Recall: ['0.0000', '0.0000', '0.8333', '0.4615']
L1 with c=0.1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5208
Precision: ['0.5000', '0.0000', '0.5238', '0.5000']
Recall: ['0.2500', '0.0000', '0.9167', '0.1176']

 -- test set -- 
Accuracy : 0.3333
Precision: ['0.0000', '0.0000', '0.3333', '0.5000']
Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with c=0.01¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train1)
prec_train = precision_score(y_train, y_pred_train1, average=None)
rec_train = recall_score(y_train, y_pred_train1, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test1)
prec_test = precision_score(y_test, y_pred_test1, average=None)
rec_test = recall_score(y_test, y_pred_test1, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5208
Precision: ['0.5000', '0.0000', '0.5227', '0.5000']
Recall: ['0.2500', '0.0000', '0.9583', '0.0588']

 -- test set -- 
Accuracy : 0.3333
Precision: ['0.0000', '0.0000', '0.3333', '0.5000']
Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with C=1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train10)
prec_train = precision_score(y_train, y_pred_train10, average=None)
rec_train = recall_score(y_train, y_pred_train10, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test10)
prec_test = precision_score(y_test, y_pred_test10, average=None)
rec_test = recall_score(y_test, y_pred_test10, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5208
Precision: ['0.5000', '0.0000', '0.5238', '0.5000']
Recall: ['0.2500', '0.0000', '0.9167', '0.1176']

 -- test set -- 
Accuracy : 0.3333
Precision: ['0.0000', '0.0000', '0.3333', '0.5000']
Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with C=0.7¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train7)
prec_train = precision_score(y_train, y_pred_train7, average=None)
rec_train = recall_score(y_train, y_pred_train7, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test7)
prec_test = precision_score(y_test, y_pred_test7, average=None)
rec_test = recall_score(y_test, y_pred_test7, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5208
Precision: ['0.5000', '0.0000', '0.5238', '0.5000']
Recall: ['0.2500', '0.0000', '0.9167', '0.1176']

 -- test set -- 
Accuracy : 0.3333
Precision: ['0.0000', '0.0000', '0.3333', '0.5000']
Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L2 Regularization¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train2)
prec_train = precision_score(y_train, y_pred_train2, average=None)
rec_train = recall_score(y_train, y_pred_train2, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test2)
prec_test = precision_score(y_test, y_pred_test2, average=None)
rec_test = recall_score(y_test, y_pred_test2, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6667
Precision: ['0.3333', '0.5000', '0.6552', '0.7857']
Recall: ['0.2500', '0.3333', '0.7917', '0.6471']

 -- test set -- 
Accuracy : 0.5238
Precision: ['0.0000', '0.0000', '0.5000', '0.8571']
Recall: ['0.0000', '0.0000', '0.8333', '0.4615']

Classifiers ("10" Catgories)¶

In [ ]:
def categorize_review_scores(score):
  if score < 10:
    return 0
  elif score >= 10 and score < 20:
    return 1
  elif score >= 20 and score < 30:
    return 2
  elif score >= 30 and score < 40:
    return 3
  elif score >= 40 and score < 50:
    return 4
  elif score >= 50 and score < 60:
    return 5
  elif score >= 60 and score < 70:
    return 6
  elif score >= 70 and score < 80:
    return 7
  elif score >= 80 and score < 90:
    return 8
  else:
    return 9
In [ ]:
df_prob_3_2 = df.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1)
df_prob_3_2['review_scores_rating'] = df_prob_3_2['review_scores_rating'].apply(lambda x: categorize_review_scores(x)).astype('category')
df_prob_3_2['review_scores_rating'].head(60)
Out[ ]:
7      9
9      9
11     9
12     9
13     9
14     9
16     8
19     9
23     9
26     9
27     8
28     9
29     6
34     9
35     9
37     8
39     8
40     8
44     9
46     8
50     9
51     8
53     9
56     9
57     9
58     9
60     9
62     9
63     9
65     9
66     9
67     9
68     9
71     9
72     9
75     9
76     9
77     9
83     8
85     9
86     9
87     9
88     9
89     9
93     9
94     9
96     9
100    9
101    9
102    9
103    9
104    8
107    9
108    9
110    9
111    9
113    9
114    9
115    9
116    9
Name: review_scores_rating, dtype: category
Categories (7, int64): [2, 4, 5, 6, 7, 8, 9]
In [ ]:
#No reviews are below 20
print(df_prob_3_2['review_scores_rating'] [df_prob_3_2['review_scores_rating'] == 0])
print(df_prob_3_2['review_scores_rating'][df_prob_3_2['review_scores_rating'] == 1])
Series([], Name: review_scores_rating, dtype: category
Categories (7, int64): [2, 4, 5, 6, 7, 8, 9])
Series([], Name: review_scores_rating, dtype: category
Categories (7, int64): [2, 4, 5, 6, 7, 8, 9])
In [ ]:
#Some review scores between 20 and 30
print(df_prob_3_2['review_scores_rating'][df_prob_3_2['review_scores_rating'] == 2])
955     2
4856    2
4881    2
5151    2
5635    2
Name: review_scores_rating, dtype: category
Categories (7, int64): [2, 4, 5, 6, 7, 8, 9]
In [ ]:
#No review scores between 30 and 40
print(df_prob_3_2['review_scores_rating'][df_prob_3_2['review_scores_rating'] == 3])
Series([], Name: review_scores_rating, dtype: category
Categories (7, int64): [2, 4, 5, 6, 7, 8, 9])
In [ ]:
df_prob_3_2.groupby('review_scores_rating').count()
Out[ ]:
host_since host_location host_response_time host_response_rate host_is_superhost host_listings_count host_has_profile_pic host_identity_verified property_type room_type ... guests_included extra_people minimum_nights availability_30 availability_60 availability_90 availability_365 number_of_reviews instant_bookable cancellation_policy
review_scores_rating
2 5 5 5 5 5 5 5 5 5 5 ... 5 5 5 5 5 5 5 5 5 5
4 1 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1
5 3 3 3 3 3 3 3 3 3 3 ... 3 3 3 3 3 3 3 3 3 3
6 25 25 25 25 25 25 25 25 25 25 ... 25 25 25 25 25 25 25 25 25 25
7 18 18 18 18 18 18 18 18 18 18 ... 18 18 18 18 18 18 18 18 18 18
8 318 318 318 318 318 318 318 318 318 318 ... 318 318 318 318 318 318 318 318 318 318
9 3002 3002 3002 3002 3002 3002 3002 3002 3002 3002 ... 3002 3002 3002 3002 3002 3002 3002 3002 3002 3002

7 rows × 29 columns

In [ ]:
df_prob_3_sampled = df_prob_3_2.groupby('review_scores_rating').apply(lambda s: s.sample(min(len(s), 200)))
In [ ]:
#Split the data on what the model is learning to predict, whether an AirBnB will be booked
X = df_prob_3_sampled.drop('review_scores_rating', axis=1)

y = df_prob_3_sampled['review_scores_rating']

#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 316 entries, (9, 4410) to (8, 2404)
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   host_since              316 non-null    int64   
 1   host_location           316 non-null    category
 2   host_response_time      316 non-null    int64   
 3   host_response_rate      316 non-null    float64 
 4   host_is_superhost       316 non-null    category
 5   host_listings_count     316 non-null    float64 
 6   host_has_profile_pic    316 non-null    category
 7   host_identity_verified  316 non-null    category
 8   property_type           316 non-null    int64   
 9   room_type               316 non-null    int64   
 10  accommodates            316 non-null    int64   
 11  bathrooms               316 non-null    float64 
 12  bedrooms                316 non-null    float64 
 13  beds                    316 non-null    float64 
 14  bed_type                316 non-null    int64   
 15  price                   316 non-null    float64 
 16  weekly_price            316 non-null    float64 
 17  security_deposit        316 non-null    float64 
 18  cleaning_fee            316 non-null    float64 
 19  guests_included         316 non-null    int64   
 20  extra_people            316 non-null    float64 
 21  minimum_nights          316 non-null    int64   
 22  availability_30         316 non-null    int64   
 23  availability_60         316 non-null    int64   
 24  availability_90         316 non-null    int64   
 25  availability_365        316 non-null    int64   
 26  number_of_reviews       316 non-null    int64   
 27  instant_bookable        316 non-null    category
 28  cancellation_policy     316 non-null    int64   
dtypes: category(5), float64(10), int64(14)
memory usage: 82.6 KB
<class 'pandas.core.series.Series'>
MultiIndex: 316 entries, (9, 4410) to (8, 2404)
Series name: review_scores_rating
Non-Null Count  Dtype   
--------------  -----   
316 non-null    category
dtypes: category(1)
memory usage: 21.9 KB

Create And Assess Decision Tree Classifiers¶

Default Tree¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_3 = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)

# fit the model to the training data
dt_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_prob_3")
Out[ ]:
'decision_tree_default_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3,
                'tree.dot',
                class_names=['2','4','5','6','7','8','9'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c250dde0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3.predict(X_train)
y_pred_test = dt_prob_3.predict(X_test)

y_prob_train = dt_prob_3.predict_proba(X_train)
y_prob_test = dt_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.7152
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.7143', '0.7160']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.7971', '0.8056']

 -- test set -- 
Accuracy : 0.4926
Precision: ['0.0000', '0.0000', '0.0000', '0.5000', '0.4861']
Recall: ['0.0000', '0.0000', '0.0000', '0.5161', '0.6250']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  4   0   0   5   7 110  28]
 [  1   1   2   8   6  28 116]]
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 1  7  3 32 21]
 [ 0  5  2 30 35]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.84
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
16    0.233062            weekly_price
4     0.200269       host_is_superhost
17    0.092846        security_deposit
26    0.074683       number_of_reviews
18    0.058502            cleaning_fee
8     0.056258           property_type
24    0.055359         availability_90
23    0.053939         availability_60
0     0.046475              host_since
20    0.045182            extra_people
19    0.037713         guests_included
28    0.030051     cancellation_policy
10    0.015662            accommodates
12    0.000000                bedrooms
21    0.000000          minimum_nights
27    0.000000        instant_bookable
2     0.000000      host_response_time
25    0.000000        availability_365
3     0.000000      host_response_rate
5     0.000000     host_listings_count
22    0.000000         availability_30
6     0.000000    host_has_profile_pic
13    0.000000                    beds
7     0.000000  host_identity_verified
9     0.000000               room_type
11    0.000000               bathrooms
15    0.000000                   price
1     0.000000           host_location
14    0.000000                bed_type
No description has been provided for this image

Tuned Tree¶

In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune_prob_3 = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune_prob_3, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
print(best_estimator)
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
{'ccp_alpha': 0, 'max_depth': 5, 'min_samples_leaf': 20}
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=20)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values

dt_tuned_prob_3 = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=20, ccp_alpha = 0)

# fit the model to the training data
dt_tuned_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=20)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=20)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_prob_3")
Out[ ]:
'decision_tree_tuned_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_prob_3,
                'tree.dot',
                class_names=['2','4','5','6','7','8','9'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c4b359c0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_prob_3.predict(X_train)
y_pred_test = dt_tuned_prob_3.predict(X_test)

y_prob_train = dt_tuned_prob_3.predict_proba(X_train)
y_prob_test = dt_tuned_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6361
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5756', '0.7477']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.8551', '0.5764']

 -- test set -- 
Accuracy : 0.5441
Precision: ['0.0000', '0.0000', '0.0000', '0.5000', '0.6364']
Recall: ['0.0000', '0.0000', '0.0000', '0.7419', '0.5000']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  5   0   1  10  10 118  61]
 [  0   1   1   3   3  20  83]]
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 1 12  5 46 28]
 [ 0  0  0 16 28]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.88
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_prob_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
4     0.334405       host_is_superhost
16    0.288412            weekly_price
17    0.155032        security_deposit
26    0.071443       number_of_reviews
19    0.062972         guests_included
22    0.059257         availability_30
21    0.028479          minimum_nights
0     0.000000              host_since
15    0.000000                   price
27    0.000000        instant_bookable
25    0.000000        availability_365
24    0.000000         availability_90
23    0.000000         availability_60
20    0.000000            extra_people
18    0.000000            cleaning_fee
14    0.000000                bed_type
1     0.000000           host_location
13    0.000000                    beds
12    0.000000                bedrooms
11    0.000000               bathrooms
10    0.000000            accommodates
9     0.000000               room_type
8     0.000000           property_type
7     0.000000  host_identity_verified
6     0.000000    host_has_profile_pic
5     0.000000     host_listings_count
3     0.000000      host_response_rate
2     0.000000      host_response_time
28    0.000000     cancellation_policy
No description has been provided for this image

Less Complex Tree¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_3_2 = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=20, ccp_alpha = 0.01)

# fit the model to the training data
dt_prob_3_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.01, max_depth=5, min_samples_leaf=20)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.01, max_depth=5, min_samples_leaf=20)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_2, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_less_complex_prob_3")
Out[ ]:
'decision_tree_less_complex_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_2,
                'tree.dot',
                class_names=['2','4','5','6','7','8','9'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57ddd4f0d0>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_2.predict(X_train)
y_pred_test = dt_prob_3_2.predict(X_test)

y_prob_train = dt_prob_3_2.predict_proba(X_train)
y_prob_test = dt_prob_3_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6139
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5507', '0.7753']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.9058', '0.4792']

 -- test set -- 
Accuracy : 0.5882
Precision: ['0.0000', '0.0000', '0.0000', '0.5300', '0.7500']
Recall: ['0.0000', '0.0000', '0.0000', '0.8548', '0.4821']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  5   0   1  10  11 125  75]
 [  0   1   1   3   2  13  69]]
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 1 12  5 53 29]
 [ 0  0  0  9 27]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.90
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_2.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
4     0.455216       host_is_superhost
16    0.333744            weekly_price
17    0.211040        security_deposit
0     0.000000              host_since
15    0.000000                   price
27    0.000000        instant_bookable
26    0.000000       number_of_reviews
25    0.000000        availability_365
24    0.000000         availability_90
23    0.000000         availability_60
22    0.000000         availability_30
21    0.000000          minimum_nights
20    0.000000            extra_people
19    0.000000         guests_included
18    0.000000            cleaning_fee
14    0.000000                bed_type
1     0.000000           host_location
13    0.000000                    beds
12    0.000000                bedrooms
11    0.000000               bathrooms
10    0.000000            accommodates
9     0.000000               room_type
8     0.000000           property_type
7     0.000000  host_identity_verified
6     0.000000    host_has_profile_pic
5     0.000000     host_listings_count
3     0.000000      host_response_rate
2     0.000000      host_response_time
28    0.000000     cancellation_policy
No description has been provided for this image

Less Min Tree¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_prob_3_3 = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=10, ccp_alpha = 0)

# fit the model to the training data
dt_prob_3_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_3, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_less_min_prob_3")
Out[ ]:
'decision_tree_less_min_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_3,
                'tree.dot',
                class_names=['2','4','5','6','7','8','9'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c458db40>
No description has been provided for this image
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_3.predict(X_train)
y_pred_test = dt_prob_3_3.predict(X_test)

y_prob_train = dt_prob_3_3.predict_proba(X_train)
y_prob_test = dt_prob_3_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.6614
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6645', '0.6585']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.7319', '0.7500']

 -- test set -- 
Accuracy : 0.5147
Precision: ['0.0000', '0.0000', '0.0000', '0.5000', '0.5312']
Recall: ['0.0000', '0.0000', '0.0000', '0.5806', '0.6071']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  1   0   1   4   9 101  36]
 [  4   1   1   9   4  37 108]]
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 1  9  4 36 22]
 [ 0  3  1 26 34]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score:
0.87
In [ ]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fpr_grid = np.linspace(0.0, 1.0, 1000)

# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score:
nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))

plt.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_prob_test[:, class_id],
        name=f"ROC curve for {n_names[class_id]}",
        color=color,
        ax=ax,
        #plot_chance_level=(class_id == 2),
    )

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
No description has been provided for this image
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_3.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df_tree['feature']=X.columns

#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)

print(df_tree2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
            y=df_tree2['feature'],
            x=df_tree2['importance'], color="lightblue")

plt.show()
    importance                 feature
4     0.275032       host_is_superhost
16    0.260477            weekly_price
17    0.127506        security_deposit
26    0.102563       number_of_reviews
0     0.063825              host_since
20    0.062049            extra_people
19    0.051791         guests_included
13    0.021509                    beds
23    0.018408         availability_60
18    0.016840            cleaning_fee
27    0.000000        instant_bookable
25    0.000000        availability_365
24    0.000000         availability_90
22    0.000000         availability_30
21    0.000000          minimum_nights
14    0.000000                bed_type
15    0.000000                   price
1     0.000000           host_location
12    0.000000                bedrooms
11    0.000000               bathrooms
10    0.000000            accommodates
9     0.000000               room_type
8     0.000000           property_type
7     0.000000  host_identity_verified
6     0.000000    host_has_profile_pic
5     0.000000     host_listings_count
3     0.000000      host_response_rate
2     0.000000      host_response_time
28    0.000000     cancellation_policy
No description has been provided for this image

Create And Assess Logistic Regression Models¶

Full Logistic¶

In [ ]:
# define the multinomial logistic regression model
logistic_model_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# fit the model on the training data
logistic_model_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')

Create The LASSO and Ridge Regression Models¶

In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection  using c = 0.1 and c = 0.01

lr_l1_1_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.1)
lr_l1_01_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.01)

# fit the models to the training data
lr_l1_1_prob_3.fit(X_train, y_train)
lr_l1_01_prob_3.fit(X_train, y_train)

# Create an Instance of Logistic Regression for LASSO Selection  using c = 1 and c = 0.7
lr_l1_10_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=1)
lr_l1_7_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.7)

# fit the models to the training data
lr_l1_10_prob_3.fit(X_train, y_train)
lr_l1_7_prob_3.fit(X_train, y_train)

# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2')

# fit the models to the training data
lr_l2_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')

Analyze The Importance Of Different Categories In The Models¶

In [ ]:
# function for model coefficients
def rpt_model_variables(model):
    # Get the intercept term
    intercept = model.intercept_

    # Access the coefficients (weights) of the model, i rounded them
    coefficients = np.round(model.coef_[0],decimals=4)

    # Create DataFrames for intercept and coefficients
    #df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
    df_coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

    # if you want to add intercept to table
    #df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)

    # Print the DataFrame
    print(df_coefficients)

    return df_coefficients

#Evalute the model coefficients for the models
print("Full Logistic Regression Model")
df_coefficients_full = rpt_model_variables(logistic_model_prob_3)
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1_prob_3)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01_prob_3)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10_prob_3)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7_prob_3)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2_prob_3)
Full Logistic Regression Model
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0084           0.0084
5      host_listings_count       0.0035           0.0035
25        availability_365      -0.0021           0.0021
26       number_of_reviews      -0.0013           0.0013
3       host_response_rate       0.0007           0.0007
23         availability_60       0.0006           0.0006
15                   price      -0.0004           0.0004
8            property_type      -0.0004           0.0004
20            extra_people      -0.0004           0.0004
24         availability_90       0.0003           0.0003
18            cleaning_fee      -0.0002           0.0002
10            accommodates       0.0001           0.0001
16            weekly_price       0.0001           0.0001
2       host_response_time       0.0001           0.0001
0               host_since      -0.0001           0.0001
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
19         guests_included       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms      -0.0000           0.0000
9                room_type       0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic      -0.0000           0.0000
4        host_is_superhost      -0.0000           0.0000
28     cancellation_policy       0.0000           0.0000
Lasso C=0.1
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0012           0.0012
5      host_listings_count       0.0006           0.0006
0               host_since      -0.0005           0.0005
16            weekly_price       0.0002           0.0002
26       number_of_reviews      -0.0001           0.0001
25        availability_365      -0.0001           0.0001
15                   price       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
24         availability_90       0.0000           0.0000
23         availability_60       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
20            extra_people      -0.0000           0.0000
19         guests_included       0.0000           0.0000
18            cleaning_fee       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type      -0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
3       host_response_rate       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Lasso C=0.01
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0011           0.0011
0               host_since      -0.0005           0.0005
5      host_listings_count       0.0005           0.0005
16            weekly_price       0.0002           0.0002
15                   price       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
26       number_of_reviews       0.0000           0.0000
25        availability_365       0.0000           0.0000
24         availability_90       0.0000           0.0000
23         availability_60       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
20            extra_people       0.0000           0.0000
19         guests_included       0.0000           0.0000
18            cleaning_fee       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type       0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
3       host_response_rate       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Lasso C=1
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0012           0.0012
5      host_listings_count       0.0006           0.0006
0               host_since      -0.0005           0.0005
16            weekly_price       0.0002           0.0002
26       number_of_reviews      -0.0001           0.0001
25        availability_365      -0.0001           0.0001
20            extra_people      -0.0001           0.0001
15                   price       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
24         availability_90       0.0000           0.0000
23         availability_60       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
19         guests_included      -0.0000           0.0000
18            cleaning_fee       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type      -0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
3       host_response_rate       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Lasso C=0.7
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0012           0.0012
5      host_listings_count       0.0006           0.0006
0               host_since      -0.0005           0.0005
16            weekly_price       0.0002           0.0002
26       number_of_reviews      -0.0001           0.0001
25        availability_365      -0.0001           0.0001
20            extra_people      -0.0001           0.0001
15                   price       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
24         availability_90       0.0000           0.0000
23         availability_60       0.0000           0.0000
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
19         guests_included      -0.0000           0.0000
18            cleaning_fee       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms       0.0000           0.0000
10            accommodates       0.0000           0.0000
9                room_type       0.0000           0.0000
8            property_type      -0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic       0.0000           0.0000
4        host_is_superhost       0.0000           0.0000
3       host_response_rate       0.0000           0.0000
2       host_response_time       0.0000           0.0000
28     cancellation_policy       0.0000           0.0000

Ridge Regression
                   feature  coefficient  abs_coefficient
17        security_deposit      -0.0084           0.0084
5      host_listings_count       0.0035           0.0035
25        availability_365      -0.0021           0.0021
26       number_of_reviews      -0.0013           0.0013
3       host_response_rate       0.0007           0.0007
23         availability_60       0.0006           0.0006
15                   price      -0.0004           0.0004
8            property_type      -0.0004           0.0004
20            extra_people      -0.0004           0.0004
24         availability_90       0.0003           0.0003
18            cleaning_fee      -0.0002           0.0002
10            accommodates       0.0001           0.0001
16            weekly_price       0.0001           0.0001
2       host_response_time       0.0001           0.0001
0               host_since      -0.0001           0.0001
22         availability_30       0.0000           0.0000
21          minimum_nights       0.0000           0.0000
19         guests_included       0.0000           0.0000
27        instant_bookable       0.0000           0.0000
14                bed_type       0.0000           0.0000
1            host_location       0.0000           0.0000
13                    beds       0.0000           0.0000
12                bedrooms       0.0000           0.0000
11               bathrooms      -0.0000           0.0000
9                room_type       0.0000           0.0000
7   host_identity_verified       0.0000           0.0000
6     host_has_profile_pic      -0.0000           0.0000
4        host_is_superhost      -0.0000           0.0000
28     cancellation_policy       0.0000           0.0000
In [ ]:
# plot variable importance

# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
  # determine the variables the model is using and create df
  # of their absolute coefficients
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  # determine the variables the model is not using
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  # bar graph of the absolute coefficients that the model is using
  plt.figure(figsize=(5, 10))
  plt.title('Variable Importance')
  plt.xlabel('Coefficient')
  plt.ylabel('Feature')
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  # print the variables the model is not using after the bar graph
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")

# plot the variable importance for the models
print("Full Logistic Regression Model")
plot_variable_imp(df_coefficients_full)
print("")
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Full Logistic Regression Model
No description has been provided for this image
-- rejected --
 availability_30
 minimum_nights
 guests_included
 instant_bookable
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 room_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 cancellation_policy

Lasso C=0.1
No description has been provided for this image
-- rejected --
 price
 instant_bookable
 availability_90
 availability_60
 availability_30
 minimum_nights
 extra_people
 guests_included
 cleaning_fee
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_rate
 host_response_time
 cancellation_policy

Lasso C=0.01
No description has been provided for this image
-- rejected --
 price
 instant_bookable
 number_of_reviews
 availability_365
 availability_90
 availability_60
 availability_30
 minimum_nights
 extra_people
 guests_included
 cleaning_fee
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_rate
 host_response_time
 cancellation_policy

Lasso C=1
No description has been provided for this image
-- rejected --
 price
 instant_bookable
 availability_90
 availability_60
 availability_30
 minimum_nights
 guests_included
 cleaning_fee
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_rate
 host_response_time
 cancellation_policy

Lasso C=0.7
No description has been provided for this image
-- rejected --
 price
 instant_bookable
 availability_90
 availability_60
 availability_30
 minimum_nights
 guests_included
 cleaning_fee
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 accommodates
 room_type
 property_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 host_response_rate
 host_response_time
 cancellation_policy

Ridge Regression
No description has been provided for this image
-- rejected --
 availability_30
 minimum_nights
 guests_included
 instant_bookable
 bed_type
 host_location
 beds
 bedrooms
 bathrooms
 room_type
 host_identity_verified
 host_has_profile_pic
 host_is_superhost
 cancellation_policy

Make Predictions To Evaluate The Models¶

In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models

#Full Regression
y_pred_train_full = logistic_model_prob_3.predict(X_train)
y_pred_test_full = logistic_model_prob_3.predict(X_test)
y_proba_train_full = logistic_model_prob_3.predict_proba(X_train)
y_proba_test_full = logistic_model_prob_3.predict_proba(X_test)

#Lasso C=0.1
y_pred_train = lr_l1_1_prob_3.predict(X_train)
y_pred_test = lr_l1_1_prob_3.predict(X_test)
y_proba_train = lr_l1_1_prob_3.predict_proba(X_train)
y_proba_test = lr_l1_1_prob_3.predict_proba(X_test)

#Lasso C=0.01
y_pred_train1 = lr_l1_01_prob_3.predict(X_train)
y_pred_test1 = lr_l1_01_prob_3.predict(X_test)
y_proba_train1 = lr_l1_01_prob_3.predict_proba(X_train)
y_proba_test1 = lr_l1_01_prob_3.predict_proba(X_test)

#Lasso C=1
y_pred_train10 = lr_l1_10_prob_3.predict(X_train)
y_pred_test10 = lr_l1_10_prob_3.predict(X_test)
y_proba_train10 = lr_l1_10_prob_3.predict_proba(X_train)
y_proba_test10 = lr_l1_10_prob_3.predict_proba(X_test)

#Lasso C=0.7
y_pred_train7 = lr_l1_7_prob_3.predict(X_train)
y_pred_test7 = lr_l1_7_prob_3.predict(X_test)
y_proba_train7 = lr_l1_7_prob_3.predict_proba(X_train)
y_proba_test7 = lr_l1_7_prob_3.predict_proba(X_test)

#Ridge Regression
y_pred_train2 = lr_l2_prob_3.predict(X_train)
y_pred_test2 = lr_l2_prob_3.predict(X_test)
y_proba_train2 = lr_l2_prob_3.predict_proba(X_train)
y_proba_test2 = lr_l2_prob_3.predict_proba(X_test)

Evaluate The Models¶

Full Model¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train_full)
prec_train = precision_score(y_train, y_pred_train_full, average=None)
rec_train = recall_score(y_train, y_pred_train_full, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test_full)
prec_test = precision_score(y_test, y_pred_test_full, average=None)
rec_test = recall_score(y_test, y_pred_test_full, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5854
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5542', '0.6200']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6667', '0.6458']

 -- test set -- 
Accuracy : 0.5147
Precision: ['0.0000', '0.0000', '0.0000', '0.4872', '0.5517']
Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5714']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with c=0.1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5190
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4911', '0.5510']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5625']

 -- test set -- 
Accuracy : 0.5221
Precision: ['0.0000', '0.0000', '0.0000', '0.5205', '0.5238']
Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5893']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with c=0.01¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train1)
prec_train = precision_score(y_train, y_pred_train1, average=None)
rec_train = recall_score(y_train, y_pred_train1, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test1)
prec_test = precision_score(y_test, y_pred_test1, average=None)
rec_test = recall_score(y_test, y_pred_test1, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5222
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4940', '0.5541']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5694']

 -- test set -- 
Accuracy : 0.5368
Precision: ['0.0000', '0.0000', '0.0000', '0.5352', '0.5385']
Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.6250']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with C=1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train10)
prec_train = precision_score(y_train, y_pred_train10, average=None)
rec_train = recall_score(y_train, y_pred_train10, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test10)
prec_test = precision_score(y_test, y_pred_test10, average=None)
rec_test = recall_score(y_test, y_pred_test10, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5190
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4911', '0.5510']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5625']

 -- test set -- 
Accuracy : 0.5221
Precision: ['0.0000', '0.0000', '0.0000', '0.5205', '0.5238']
Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5893']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L1 with C=0.7¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train7)
prec_train = precision_score(y_train, y_pred_train7, average=None)
rec_train = recall_score(y_train, y_pred_train7, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test7)
prec_test = precision_score(y_test, y_pred_test7, average=None)
rec_test = recall_score(y_test, y_pred_test7, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5190
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4911', '0.5510']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5625']

 -- test set -- 
Accuracy : 0.5221
Precision: ['0.0000', '0.0000', '0.0000', '0.5205', '0.5238']
Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5893']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
L2 Regularization¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train2)
prec_train = precision_score(y_train, y_pred_train2, average=None)
rec_train = recall_score(y_train, y_pred_train2, average=None)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test2)
prec_test = precision_score(y_test, y_pred_test2, average=None)
rec_test = recall_score(y_test, y_pred_test2, average=None)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
 -- train set -- 
Accuracy : 0.5854
Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5542', '0.6200']
Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6667', '0.6458']

 -- test set -- 
Accuracy : 0.5147
Precision: ['0.0000', '0.0000', '0.0000', '0.4872', '0.5517']
Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5714']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

Exploratory Analysis¶

In [ ]:
df_prob_2
Out[ ]:
host_since host_location host_response_time host_response_rate host_is_superhost host_listings_count host_has_profile_pic host_identity_verified property_type room_type ... availability_90 review_scores_rating review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable cancellation_policy
7 2014 1 3 100.0 0 1.0 1 1 9 1 ... 0 100.0 10.0 10.0 10.0 10.0 10.0 10.0 1 1
9 2012 1 3 100.0 1 1.0 1 1 9 1 ... 0 99.0 10.0 10.0 10.0 10.0 10.0 10.0 0 1
11 2011 1 3 100.0 0 1.0 1 1 9 1 ... 0 93.0 10.0 10.0 10.0 10.0 10.0 10.0 0 3
12 2013 1 3 97.0 1 9.0 1 1 9 1 ... 0 100.0 9.0 10.0 10.0 10.0 10.0 10.0 0 3
13 2013 1 3 97.0 1 9.0 1 1 9 1 ... 0 100.0 10.0 10.0 10.0 10.0 10.0 10.0 0 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5826 2013 1 2 100.0 0 339.0 1 1 0 0 ... 0 85.0 9.0 9.0 9.0 9.0 10.0 8.0 0 3
5827 2011 1 2 87.0 0 19.0 1 1 0 0 ... 0 94.0 9.0 9.0 10.0 9.0 10.0 9.0 0 3
5829 2012 1 2 95.0 0 11.0 1 0 0 0 ... 0 88.0 10.0 10.0 10.0 10.0 10.0 10.0 0 1
5830 2013 1 2 100.0 1 21.0 1 1 0 0 ... 0 100.0 10.0 10.0 10.0 10.0 10.0 9.0 0 3
5832 2014 0 3 100.0 0 5.0 1 0 9 0 ... 0 100.0 8.0 10.0 10.0 10.0 10.0 8.0 1 3

3372 rows × 32 columns

In [ ]:
booked_avg_price = df_prob_2.groupby('availability_90')['price'].mean().reset_index()
booked_avg_price['availability_90'] = booked_avg_price['availability_90'].astype('string')
booked_avg_price['availability_90'][0] = "Non-Booked"
booked_avg_price['availability_90'][1] = "Booked"
booked_avg_price
Out[ ]:
availability_90 price
0 Non-Booked 211.695888
1 Booked 199.032746
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_price,
              x='availability_90', y='price', color='lightblue')
plot.yaxis.set_major_formatter('${x:1.0f}') # Make y-axis currency
#Give the column chart a title
plt.title("Average Price Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+3, # y-coordinate position of data label, padded 3 above bar
 s = '${:.2f}'.format(height), # data label format of currency
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
No description has been provided for this image
In [ ]:
booked_avg_review_score = df_prob_2.groupby('availability_90')['review_scores_rating'].mean().reset_index()
booked_avg_review_score


booked_avg_review_score['availability_90'] = booked_avg_price['availability_90'].astype('string')
booked_avg_review_score['availability_90'][0] = "Non-Booked"
booked_avg_review_score['availability_90'][1] = "Booked"
booked_avg_review_score['review_scores_rating'] = booked_avg_review_score['review_scores_rating'] / 100
booked_avg_review_score
Out[ ]:
availability_90 review_scores_rating
0 Non-Booked 0.953491
1 Booked 0.959433
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_review_score,
              x='availability_90', y='review_scores_rating', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
#Give the column chart a title
plt.title("Average Review Scores Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.01, # y-coordinate position of data label, padded 0.01 above bar
  s = '{:.2%}'.format(height), # data label format of percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
No description has been provided for this image
In [ ]:
booked_avg_min_nights = df_prob_2.groupby('availability_90')['minimum_nights'].mean().reset_index()
booked_avg_min_nights['availability_90'] = booked_avg_min_nights['availability_90'].astype('string')
booked_avg_min_nights['availability_90'][0] = "Non-Booked"
booked_avg_min_nights['availability_90'][1] = "Booked"
booked_avg_min_nights
Out[ ]:
availability_90 minimum_nights
0 Non-Booked 1.911947
1 Booked 2.134761
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_min_nights,
              x='availability_90', y='minimum_nights', color='lightblue')
#Give the column chart a title
plt.title("Average Minimum Nights Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 3 above bar
 s = '{:.2f}'.format(height), # data label format of currency
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
No description has been provided for this image
In [ ]:
booked_min_nights = df_prob_2.groupby('availability_90')['minimum_nights'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_min_nights['availability_90'] = booked_avg_min_nights['availability_90'].astype('string')
booked_min_nights['availability_90'][0] = "Non-Booked"
booked_min_nights['availability_90'][1] = "Booked"
booked_min_nights
Out[ ]:
availability_90 min mean median max
0 Non-Booked 1 1.911947 2.0 45
1 Booked 1 2.134761 2.0 60
In [ ]:
booked_avg_sec_deposit = df_prob_2.groupby('availability_90')['security_deposit'].mean().reset_index()
booked_avg_sec_deposit['availability_90'] = booked_avg_sec_deposit['availability_90'].astype('string')
booked_avg_sec_deposit['availability_90'][0] = "Non-Booked"
booked_avg_sec_deposit['availability_90'][1] = "Booked"
booked_avg_sec_deposit
Out[ ]:
availability_90 security_deposit
0 Non-Booked 193.270365
1 Booked 189.195214
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_sec_deposit,
              x='availability_90', y='security_deposit', color='lightblue')
plot.yaxis.set_major_formatter('${x:1.0f}') # Make y-axis currency
#Give the column chart a title
plt.title("Average Security Deposit Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+3, # y-coordinate position of data label, padded 3 above bar
 s = '${:.2f}'.format(height), # data label format of currency
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
No description has been provided for this image
In [ ]:
booked_sec_deposit = df_prob_2.groupby('availability_90')['security_deposit'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_sec_deposit['availability_90'] = booked_sec_deposit['availability_90'].astype('string')
booked_sec_deposit['availability_90'][0] = "Non-Booked"
booked_sec_deposit['availability_90'][1] = "Booked"
booked_sec_deposit
Out[ ]:
availability_90 min mean median max
0 Non-Booked 0.0 193.270365 100.0 5000.0
1 Booked 0.0 189.195214 0.0 3000.0
In [ ]:
print("Value Counts Non-Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[0])
print("Percentages Non-Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[0] / df_prob_2.groupby('availability_90')['security_deposit'].count()[0])

print("Value Counts Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[1])
print("Percentages Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[1] / df_prob_2.groupby('availability_90')['security_deposit'].count()[1])
Value Counts Non-Booked: security_deposit
0.0       1227
500.0      242
100.0      213
200.0      206
250.0      151
300.0      126
1000.0      82
150.0       78
95.0        56
400.0       41
350.0       28
600.0       12
125.0       11
1500.0      11
750.0       10
2000.0      10
800.0        7
450.0        6
700.0        6
295.0        5
2500.0       5
99.0         4
195.0        3
1200.0       3
120.0        2
129.0        2
175.0        2
225.0        2
550.0        2
850.0        2
900.0        2
3000.0       2
160.0        1
180.0        1
185.0        1
234.0        1
240.0        1
275.0        1
280.0        1
290.0        1
315.0        1
375.0        1
650.0        1
675.0        1
945.0        1
999.0        1
1250.0       1
1600.0       1
1899.0       1
2900.0       1
5000.0       1
Name: security_deposit, dtype: int64
Percentages Non-Booked: security_deposit
0.0       0.475950
500.0     0.093871
100.0     0.082622
200.0     0.079907
250.0     0.058573
300.0     0.048875
1000.0    0.031808
150.0     0.030256
95.0      0.021722
400.0     0.015904
350.0     0.010861
600.0     0.004655
125.0     0.004267
1500.0    0.004267
750.0     0.003879
2000.0    0.003879
800.0     0.002715
450.0     0.002327
700.0     0.002327
295.0     0.001939
2500.0    0.001939
99.0      0.001552
195.0     0.001164
1200.0    0.001164
120.0     0.000776
129.0     0.000776
175.0     0.000776
225.0     0.000776
550.0     0.000776
850.0     0.000776
900.0     0.000776
3000.0    0.000776
160.0     0.000388
180.0     0.000388
185.0     0.000388
234.0     0.000388
240.0     0.000388
275.0     0.000388
280.0     0.000388
290.0     0.000388
315.0     0.000388
375.0     0.000388
650.0     0.000388
675.0     0.000388
945.0     0.000388
999.0     0.000388
1250.0    0.000388
1600.0    0.000388
1899.0    0.000388
2900.0    0.000388
5000.0    0.000388
Name: security_deposit, dtype: float64
Value Counts Booked: security_deposit
0.0       409
500.0      64
200.0      55
100.0      52
300.0      47
150.0      31
250.0      31
1000.0     18
350.0      12
400.0      12
95.0       11
750.0       7
450.0       4
600.0       4
1500.0      4
125.0       3
175.0       3
199.0       3
700.0       3
800.0       3
1200.0      3
3000.0      3
900.0       2
149.0       1
180.0       1
275.0       1
375.0       1
1100.0      1
1250.0      1
1400.0      1
2000.0      1
2400.0      1
2500.0      1
Name: security_deposit, dtype: int64
Percentages Booked: security_deposit
0.0       0.515113
500.0     0.080605
200.0     0.069270
100.0     0.065491
300.0     0.059194
150.0     0.039043
250.0     0.039043
1000.0    0.022670
350.0     0.015113
400.0     0.015113
95.0      0.013854
750.0     0.008816
450.0     0.005038
600.0     0.005038
1500.0    0.005038
125.0     0.003778
175.0     0.003778
199.0     0.003778
700.0     0.003778
800.0     0.003778
1200.0    0.003778
3000.0    0.003778
900.0     0.002519
149.0     0.001259
180.0     0.001259
275.0     0.001259
375.0     0.001259
1100.0    0.001259
1250.0    0.001259
1400.0    0.001259
2000.0    0.001259
2400.0    0.001259
2500.0    0.001259
Name: security_deposit, dtype: float64
In [ ]:
booked_room_type = df_prob_2.groupby('availability_90')['room_type'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_room_type['availability_90'] = booked_room_type['availability_90'].astype('string')
booked_room_type['availability_90'][0] = "Non-Booked"
booked_room_type['availability_90'][1] = "Booked"
booked_room_type
Out[ ]:
availability_90 min mean median max
0 Non-Booked 0 0.330877 0.0 2
1 Booked 0 0.240554 0.0 2
In [ ]:
print("Value Counts:", df_prob_2.groupby('availability_90')['room_type'].value_counts())
print("Percentages:", df_prob_2.groupby('availability_90')['room_type'].value_counts() / df_prob_2.groupby('availability_90')['room_type'].count())
Value Counts: availability_90  room_type
0                0            1777
                 1             749
                 2              52
1                0             616
                 1             165
                 2              13
Name: room_type, dtype: int64
Percentages: availability_90  room_type
0                0            0.689294
                 1            0.290535
                 2            0.020171
1                0            0.775819
                 1            0.207809
                 2            0.016373
Name: room_type, dtype: float64
In [ ]:
booked_host_listings = df_prob_2.groupby('availability_90')['host_listings_count'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_host_listings['availability_90'] = booked_host_listings['availability_90'].astype('string')
booked_host_listings['availability_90'][0] = "Non-Booked"
booked_host_listings['availability_90'][1] = "Booked"
booked_host_listings
Out[ ]:
availability_90 min mean median max
0 Non-Booked 1.0 17.251746 1.0 339.0
1 Booked 1.0 12.255668 1.0 339.0
In [ ]:
df_prob_2_temp = df_prob_2.copy()
df_prob_2_temp['instant_bookable'] = df_prob_2_temp['instant_bookable'].astype('int')
booked_instant = df_prob_2_temp.groupby('availability_90')['instant_bookable'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_instant['availability_90'] = booked_host_listings['availability_90'].astype('string')
booked_instant['availability_90'][0] = "Non-Booked"
booked_instant['availability_90'][1] = "Booked"
booked_instant
Out[ ]:
availability_90 min mean median max
0 Non-Booked 0 0.117145 0.0 1
1 Booked 0 0.103275 0.0 1
In [ ]:
df_prob_2_temp = df_prob_2.copy()
df_prob_2_temp['host_identity_verified'] = df_prob_2_temp['host_identity_verified'].astype('int')
booked_host_ver = df_prob_2_temp.groupby('availability_90')['host_identity_verified'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_host_ver['availability_90'] = booked_host_ver['availability_90'].astype('string')
booked_host_ver['availability_90'][0] = "Non-Booked"
booked_host_ver['availability_90'][1] = "Booked"
booked_host_ver
Out[ ]:
availability_90 min mean median max
0 Non-Booked 0 0.803724 1.0 1
1 Booked 0 0.751889 1.0 1
In [ ]:
df_prob_2_temp = df_prob_2.copy()
df_prob_2_temp['cancellation_policy'] = df_prob_2_temp['cancellation_policy'].astype('int')
print("Value Counts:", df_prob_2_temp.groupby('availability_90')['cancellation_policy'].value_counts())
print("Percentages:", df_prob_2_temp.groupby('availability_90')['cancellation_policy'].value_counts() / df_prob_2_temp.groupby('availability_90')['cancellation_policy'].count())
booked_cancel_policy = df_prob_2_temp.groupby('availability_90')['cancellation_policy'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_cancel_policy['availability_90'] = booked_host_ver['availability_90'].astype('string')
booked_cancel_policy['availability_90'][0] = "Non-Booked"
booked_cancel_policy['availability_90'][1] = "Booked"
print(booked_cancel_policy)
Value Counts: availability_90  cancellation_policy
0                3                      1289
                 1                       673
                 0                       608
                 4                         7
                 2                         1
1                3                       361
                 0                       222
                 1                       210
                 4                         1
Name: cancellation_policy, dtype: int64
Percentages: availability_90  cancellation_policy
0                3                      0.500000
                 1                      0.261055
                 0                      0.235842
                 4                      0.002715
                 2                      0.000388
1                3                      0.454660
                 0                      0.279597
                 1                      0.264484
                 4                      0.001259
Name: cancellation_policy, dtype: float64
  availability_90  min      mean  median  max
0      Non-Booked    0  1.772692     3.0    4
1          Booked    0  1.633501     1.0    4

flexible = 0, moderate = 1, no_refunds = 2, strict = 3, super_strict_30 = 4

In [ ]:
df_temp = df.copy()
df_temp['review_scores_rating'] = df_temp['review_scores_rating'].apply(lambda x: categorize_review_scores(x))
reviews_sec_deposit = df_temp.groupby('review_scores_rating')['security_deposit'].agg(['min', 'mean', 'median', 'max', 'count']).reset_index()
reviews_sec_deposit
Out[ ]:
review_scores_rating min mean median max count
0 2 0.0 0.000000 0.0 0.0 5
1 4 0.0 0.000000 0.0 0.0 1
2 5 0.0 83.333333 0.0 250.0 3
3 6 0.0 134.000000 0.0 1000.0 25
4 7 0.0 233.333333 0.0 3000.0 18
5 8 0.0 143.138365 0.0 2400.0 318
6 9 0.0 198.252498 100.0 5000.0 3002
In [ ]:
print("Value Counts 8:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[8])
print("Percentages 8:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[8] / df_temp.groupby('review_scores_rating')['security_deposit'].count()[8])

print("Value Counts 9:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[9])
print("Percentages 9:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[9] / df_temp.groupby('review_scores_rating')['security_deposit'].count()[9])
Value Counts 8: security_deposit
0.0       186
100.0      30
500.0      21
250.0      18
300.0      15
200.0      12
150.0       7
95.0        5
400.0       5
1000.0      5
175.0       2
700.0       2
125.0       1
195.0       1
234.0       1
240.0       1
800.0       1
1200.0      1
1250.0      1
1899.0      1
2000.0      1
2400.0      1
Name: security_deposit, dtype: int64
Percentages 8: security_deposit
0.0       0.584906
100.0     0.094340
500.0     0.066038
250.0     0.056604
300.0     0.047170
200.0     0.037736
150.0     0.022013
95.0      0.015723
400.0     0.015723
1000.0    0.015723
175.0     0.006289
700.0     0.006289
125.0     0.003145
195.0     0.003145
234.0     0.003145
240.0     0.003145
800.0     0.003145
1200.0    0.003145
1250.0    0.003145
1899.0    0.003145
2000.0    0.003145
2400.0    0.003145
Name: security_deposit, dtype: float64
Value Counts 9: security_deposit
0.0       1412
500.0      281
200.0      245
100.0      235
250.0      160
300.0      158
150.0      102
1000.0      94
95.0        62
400.0       48
350.0       40
750.0       17
600.0       16
1500.0      15
125.0       13
450.0       10
2000.0      10
800.0        9
700.0        7
2500.0       6
295.0        5
1200.0       5
99.0         4
900.0        4
3000.0       4
175.0        3
199.0        3
120.0        2
129.0        2
180.0        2
195.0        2
225.0        2
275.0        2
375.0        2
550.0        2
850.0        2
149.0        1
160.0        1
185.0        1
280.0        1
290.0        1
315.0        1
650.0        1
675.0        1
945.0        1
999.0        1
1100.0       1
1250.0       1
1400.0       1
1600.0       1
2900.0       1
5000.0       1
Name: security_deposit, dtype: int64
Percentages 9: security_deposit
0.0       0.470353
500.0     0.093604
200.0     0.081612
100.0     0.078281
250.0     0.053298
300.0     0.052632
150.0     0.033977
1000.0    0.031312
95.0      0.020653
400.0     0.015989
350.0     0.013324
750.0     0.005663
600.0     0.005330
1500.0    0.004997
125.0     0.004330
450.0     0.003331
2000.0    0.003331
800.0     0.002998
700.0     0.002332
2500.0    0.001999
295.0     0.001666
1200.0    0.001666
99.0      0.001332
900.0     0.001332
3000.0    0.001332
175.0     0.000999
199.0     0.000999
120.0     0.000666
129.0     0.000666
180.0     0.000666
195.0     0.000666
225.0     0.000666
275.0     0.000666
375.0     0.000666
550.0     0.000666
850.0     0.000666
149.0     0.000333
160.0     0.000333
185.0     0.000333
280.0     0.000333
290.0     0.000333
315.0     0.000333
650.0     0.000333
675.0     0.000333
945.0     0.000333
999.0     0.000333
1100.0    0.000333
1250.0    0.000333
1400.0    0.000333
1600.0    0.000333
2900.0    0.000333
5000.0    0.000333
Name: security_deposit, dtype: float64
In [ ]:
df_temp = df.copy()
df_temp['review_scores_rating'] = df_temp['review_scores_rating'].apply(lambda x: categorize_review_scores(x))
reviews_sec_deposit = df_temp.groupby('review_scores_rating')['weekly_price'].agg(['min', 'mean', 'median', 'max', 'count']).reset_index()
reviews_sec_deposit
Out[ ]:
review_scores_rating min mean median max count
0 2 105.0 2312.800000 2450.0 5313.0 5
1 4 3999.0 3999.000000 3999.0 3999.0 1
2 5 220.0 936.666667 525.0 2065.0 3
3 6 175.0 1446.920000 910.0 5593.0 25
4 7 420.0 1574.500000 775.0 7000.0 18
5 8 105.0 1289.025157 800.0 8750.0 318
6 9 95.0 1401.106596 900.0 17843.0 3002
In [ ]:
print("Mode 8:", df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[8])
print("Percentages 8:", max(df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[8] / df_temp.groupby('review_scores_rating')['weekly_price'].count()[8]))

print("Mode 9:", df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[9])
print("Percentages 9:", df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[9] / df_temp.groupby('review_scores_rating')['weekly_price'].count()[9])
Mode 8: weekly_price
1050.0    12
350.0      9
700.0      9
1750.0     9
250.0      7
          ..
6846.0     1
6999.0     1
7500.0     1
8400.0     1
8750.0     1
Name: weekly_price, Length: 153, dtype: int64
Percentages 8: 0.03773584905660377
Mode 9: weekly_price
1050.0     82
700.0      81
1400.0     71
500.0      63
875.0      57
           ..
12565.0     1
12950.0     1
15393.0     1
17500.0     1
17843.0     1
Name: weekly_price, Length: 588, dtype: int64
Percentages 9: weekly_price
1050.0     0.027315
700.0      0.026982
1400.0     0.023651
500.0      0.020986
875.0      0.018987
             ...   
12565.0    0.000333
12950.0    0.000333
15393.0    0.000333
17500.0    0.000333
17843.0    0.000333
Name: weekly_price, Length: 588, dtype: float64
In [ ]:
df_temp = df.copy()
df_temp['review_scores_rating'] = df_temp['review_scores_rating'].apply(lambda x: categorize_review_scores(x))
df_temp['host_is_superhost'] = df_temp['host_is_superhost'].astype('int')
reviews_sec_deposit = df_temp.groupby('review_scores_rating')['host_is_superhost'].agg(['min', 'mean', 'median', 'max', 'count']).reset_index()
reviews_sec_deposit
Out[ ]:
review_scores_rating min mean median max count
0 2 0 0.000000 0.0 0 5
1 4 1 1.000000 1.0 1 1
2 5 0 0.000000 0.0 0 3
3 6 0 0.080000 0.0 1 25
4 7 0 0.055556 0.0 1 18
5 8 0 0.012579 0.0 1 318
6 9 0 0.247835 0.0 1 3002